提交 923b1887 编写于 作者: T Tao Luo

Merge branch 'develop' into memory_load

test=develop
...@@ -166,6 +166,8 @@ function(op_library TARGET) ...@@ -166,6 +166,8 @@ function(op_library TARGET)
# Append first implemented MKLDNN activation operator # Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
else() else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif() endif()
......
...@@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non ...@@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......
add_subdirectory(memory) add_subdirectory(memory)
add_subdirectory(platform) add_subdirectory(platform)
add_subdirectory(framework) add_subdirectory(framework)
add_subdirectory(imperative)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(string) add_subdirectory(string)
add_subdirectory(recordio) add_subdirectory(recordio)
......
...@@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) ...@@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache) shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
...@@ -191,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry ...@@ -191,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(tuple_test SRCS tuple_test.cc ) cc_test(tuple_test SRCS tuple_test.cc )
......
...@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#ifdef PADDLE_WITH_CUDA
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
#endif
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {
// Wait input done // Wait input done
......
...@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
auto multi_devices_pass = AppendPass("multi_devices_pass"); auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_); &strategy_);
multi_devices_pass->Set<int>("num_trainers",
new int(strategy_.num_trainers_));
// Add a graph print pass to record a graph with device info. // Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy_.debug_graphviz_path_.empty()) {
......
...@@ -133,6 +133,7 @@ static const char kPlaces[] = "places"; ...@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
static const char kParams[] = "params"; static const char kParams[] = "params";
static const char kLocalScopes[] = "local_scopes"; static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy"; static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers";
void MultiDevSSAGraphBuilder::Init() const { void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear(); all_vars_.clear();
...@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
auto nodes = graph->ReleaseNodes(); auto nodes = graph->ReleaseNodes();
ir::Graph &result = *graph; ir::Graph &result = *graph;
int num_trainers = Get<int>(kNumTrainers);
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->IsVar() && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
...@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
// Currently, we assume that once gradient is generated, it can be // Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. // broadcast, and each gradient is only broadcast once.
if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr( if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
...@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass, ...@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
.RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kParams)
.RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy); .RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers);
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, ...@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
return tensor; return tensor;
} }
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
Variable* var = scope.FindVar(var_name);
PADDLE_ENFORCE(var, "%s no in scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
return *var->GetMutable<LoDTensor>();
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, ...@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index); size_t index);
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -38,8 +38,7 @@ void CheckProgram(const ProgramDesc &program) { ...@@ -38,8 +38,7 @@ void CheckProgram(const ProgramDesc &program) {
switch (role_id) { switch (role_id) {
case _INT(OpRole::kForward): case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR) LOG(ERROR) << "Cannot add backward operator before forward operator "
<< "Cannot add backward operator before forward operator %s."
<< op->Type(); << op->Type();
} }
break; break;
......
...@@ -177,14 +177,13 @@ class Graph { ...@@ -177,14 +177,13 @@ class Graph {
return nullptr; return nullptr;
} }
const ProgramDesc &program() const { return program_; }
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
void ResolveHazard( void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes); const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private: private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// This method takes ownership of `node`. // This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) { ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_kernel_type.h"
namespace paddle {
namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
int cur_loc = 0;
int place = key.place_.which();
cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc;
cur_loc += OpKernelType::kPrimaryDTypeBits;
int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
cur_loc += OpKernelType::kLayoutBits;
int library_type = static_cast<int>(key.library_type_) << cur_loc;
cur_loc += OpKernelType::kLibBits;
int customized_value = key.customized_type_value_;
PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
customized_value = customized_value << cur_loc;
cur_loc += OpKernelType::kCustomizeBits;
PADDLE_ENFORCE(cur_loc < 64);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
customized_value);
}
bool OpKernelType::operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_ &&
customized_type_value_ == o.customized_type_value_;
}
} // namespace framework
} // namespace paddle
...@@ -24,54 +24,55 @@ limitations under the License. */ ...@@ -24,54 +24,55 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct OpKernelType { class OpKernelType {
struct Hash { public:
size_t operator()(const OpKernelType& key) const { constexpr static int kDefaultCustomizedTypeValue = 0;
int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
int library_type = static_cast<int>(key.library_type_)
<< (LEFT_SHIFT * 3);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type);
}
};
// place, data_type, library_type kinds less than 2^8 // In total should be smaller than 64.
constexpr static int LEFT_SHIFT = 8; constexpr static int kPlaceBits = 4;
constexpr static int kPrimaryDTypeBits = 8;
proto::VarType::Type data_type_; constexpr static int kLayoutBits = 4;
DataLayout data_layout_; constexpr static int kLibBits = 4;
platform::Place place_; constexpr static int kCustomizeBits = 4;
LibraryType library_type_;
OpKernelType(proto::VarType::Type data_type, platform::Place place, OpKernelType(proto::VarType::Type data_type, platform::Place place,
DataLayout data_layout = DataLayout::kAnyLayout, DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain) LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type), : data_type_(data_type),
data_layout_(data_layout), data_layout_(data_layout),
place_(place), place_(place),
library_type_(library_type) {} library_type_(library_type),
customized_type_value_(customized_type_value) {}
OpKernelType(proto::VarType::Type data_type, OpKernelType(proto::VarType::Type data_type,
const platform::DeviceContext& dev_ctx, const platform::DeviceContext& dev_ctx,
DataLayout data_layout = DataLayout::kAnyLayout, DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain) LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type), : data_type_(data_type),
data_layout_(data_layout), data_layout_(data_layout),
place_(dev_ctx.GetPlace()), place_(dev_ctx.GetPlace()),
library_type_(library_type) {} library_type_(library_type),
customized_type_value_(customized_type_value) {}
virtual ~OpKernelType() {}
struct Hash {
size_t operator()(const OpKernelType& key) const;
};
size_t hash_key() const { return Hash()(*this); } size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const { bool operator==(const OpKernelType& o) const;
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_;
}
bool operator!=(const OpKernelType& o) const { return !(*this == o); } bool operator!=(const OpKernelType& o) const { return !(*this == o); }
proto::VarType::Type data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
int customized_type_value_;
}; };
inline std::ostream& operator<<(std::ostream& os, inline std::ostream& operator<<(std::ostream& os,
......
...@@ -35,6 +35,7 @@ limitations under the License. */ ...@@ -35,6 +35,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Registrar { class Registrar {
public: public:
// In our design, various kinds of classes, e.g., operators and kernels, // In our design, various kinds of classes, e.g., operators and kernels,
...@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor; ...@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor;
template <typename PlaceType, typename T, typename Func> template <typename PlaceType, typename T, typename Func>
inline void RegisterKernelClass(const char* op_type, const char* library_type, inline void RegisterKernelClass(const char* op_type, const char* library_type,
Func func) { int customized_type_value, Func func) {
std::string library(library_type); std::string library(library_type);
std::string data_layout = "ANYLAYOUT"; std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") { if (library == "MKLDNN") {
...@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type, ...@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type,
} }
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout), StringToDataLayout(data_layout),
StringToLibraryType(library_type)); StringToLibraryType(library_type), customized_type_value);
OperatorWithKernel::AllOpKernels()[op_type][key] = func; OperatorWithKernel::AllOpKernels()[op_type][key] = func;
} }
...@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE = using KERNEL_TYPE =
typename std::tuple_element<I, std::tuple<KernelTypes...>>::type; typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
RegisterKernelClass<PlaceType, T>( RegisterKernelClass<PlaceType, T>(
op_type, library_type, [](const framework::ExecutionContext& ctx) { op_type, library_type, customized_type_value,
[](const framework::ExecutionContext& ctx) {
KERNEL_TYPE().Compute(ctx); KERNEL_TYPE().Compute(ctx);
}); });
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...> OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
template <typename PlaceType, size_t I, typename... KernelType> template <typename PlaceType, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type, const char* library_type) const {} void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
}; };
// User can register many kernel in one place. The data type could be // User can register many kernel in one place. The data type could be
...@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { ...@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
template <typename PlaceType, typename... KernelType> template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar { class OpKernelRegistrar : public Registrar {
public: public:
explicit OpKernelRegistrar(const char* op_type, const char* library_type) { explicit OpKernelRegistrar(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func; OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
...@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx; ...@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx;
template <typename PlaceType, typename... DataTypeAndKernelType> template <typename PlaceType, typename... DataTypeAndKernelType>
class OpKernelRegistrarEx : public Registrar { class OpKernelRegistrarEx : public Registrar {
public: public:
explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...> OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType> template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
struct OpKernelRegistrarFunctorEx<PlaceType, true, I, struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
DataTypeAndKernelType...> { DataTypeAndKernelType...> {
void operator()(const char* op_type, const char* library_type) const {} void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
}; };
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType> template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
...@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
typename std::tuple_element<I, typename std::tuple_element<I,
std::tuple<DataTypeAndKernelType...>>::type; std::tuple<DataTypeAndKernelType...>>::type;
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type,
RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor()); int customized_type_value) const {
RegisterKernelClass<PlaceType, T>(op_type, library_type,
customized_type_value, Functor());
constexpr auto size = constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value; std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2, OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
DataTypeAndKernelType...> DataTypeAndKernelType...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
// clang-format off
/** /**
* check if MACRO is used in GLOBAL NAMESPACE. * check if MACRO is used in GLOBAL NAMESPACE.
*/ */
...@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
/** /**
* Macro to register OperatorKernel. * Macro to register OperatorKernel.
*/ */
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##__, \ __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \ "REGISTER_OP_KERNEL must be called in " \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \ "global namespace"); \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ static ::paddle::framework::OpKernelRegistrar<place_class, \
#library_type); \ __VA_ARGS__> \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \ __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ #op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \ return 0; \
} }
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \
op_type, library_type, place_class, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##library_type##__, \ __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
"REGISTER_OP_KERNEL_EX must be called in global namespace"); \ "REGISTER_OP_KERNEL_EX must be called in " \
static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \ "global namespace"); \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ static ::paddle::framework::OpKernelRegistrarEx<place_class, \
#library_type); \ __VA_ARGS__> \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \ __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ #op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \ return 0; \
} }
#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ REGISTER_OP_KERNEL_EX( \
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL_EX( \
op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/** /**
* Macro to mark what Operator and Kernel * Macro to mark what Operator and Kernel
...@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
extern int TouchOpRegistrar_##op_type(); \ extern int TouchOpRegistrar_##op_type(); \
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type() UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ #define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \ "USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ extern int \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \ TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
// TODO(fengjiayi): The following macros // TODO(fengjiayi): The following macros
// seems ugly, do we have better method? // seems ugly, do we have better method?
...@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define USE_OP(op_type) \ #define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \ USE_OP_ITSELF(op_type); \
USE_OP_KERNEL(op_type) USE_OP_KERNEL(op_type)
// clang-format off
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
AddInput("input", "input of test op"); AddInput("input", "input of test op");
AddOutput("output", "output of test op"); AddOutput("output", "output of test op");
AddAttr<float>("scale", "scale of cosine op"); AddAttr<float>("scale", "scale of cosine op");
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
...@@ -95,6 +97,8 @@ TEST(OperatorBase, all) { ...@@ -95,6 +97,8 @@ TEST(OperatorBase, all) {
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static int special_type_value = 1;
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public: public:
void Make() { void Make() {
...@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
static int cpu_kernel_run_num = 0; static int cpu_kernel_run_num = 0;
static int cpu_kernel2_run_num = 0;
class OpWithKernelTest : public OperatorWithKernel { class OpWithKernelTest : public OperatorWithKernel {
public: public:
...@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetExpectedKernelType( OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override { const ExecutionContext& ctx) const override {
return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); int sub_type = ctx.Attr<int>("kernel_sub_type");
return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
framework::DataLayout::kAnyLayout,
framework::LibraryType::kPlain, sub_type);
} }
}; };
...@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> { ...@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> {
} }
}; };
template <typename T1, typename T2>
class CPUKernel2Test : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << ctx.op().DebugString() << std::endl;
cpu_kernel2_run_num++;
ASSERT_EQ(ctx.op().Input("x"), "IN1");
ASSERT_EQ(ctx.op().Output("y"), "OUT1");
}
};
class OpKernelTestMultiInputsProtoAndCheckerMaker class OpKernelTestMultiInputsProtoAndCheckerMaker
: public OpProtoAndCheckerMaker { : public OpProtoAndCheckerMaker {
public: public:
...@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker ...@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
...@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> { ...@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> {
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_WITHOUT_GRADIENT(
op_with_kernel, paddle::framework::OpWithKernelTest, op_with_kernel, paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestProtoAndCheckerMaker); paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_with_kernel, REGISTER_OP_CPU_KERNEL(op_with_kernel,
paddle::framework::CPUKernelTest<float, float>); paddle::framework::CPUKernelTest<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
paddle::framework::special_type_value,
paddle::framework::CPUKernel2Test<float, float>);
// test with single input // test with single input
TEST(OpKernel, all) { TEST(OpKernel, all) {
paddle::framework::InitDevices(true); paddle::framework::InitDevices(true);
...@@ -211,7 +240,19 @@ TEST(OpKernel, all) { ...@@ -211,7 +240,19 @@ TEST(OpKernel, all) {
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_place); op->Run(scope, cpu_place);
// kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
attr = op_desc.mutable_attrs()->Add();
attr->set_name("kernel_sub_type");
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(1);
auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
op2->Run(scope, cpu_place);
// kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
} }
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_WITHOUT_GRADIENT(
......
cc_library(layer SRCS layer.cc DEPS proto_desc operator)
cc_library(tracer SRCS tracer.cc DEPS proto_desc)
cc_library(engine SRCS engine.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/engine.h"
#include <mutex> // NOLINT
#include <vector>
#include "glog/logging.h"
namespace paddle {
namespace imperative {
static std::once_flag init_engine;
static Engine* engine;
class DummyEngine : public Engine {
public:
void Enqueue(Runnable* runnable) override {
queued_runnables_.push_back(runnable);
}
size_t Size() const override { return queued_runnables_.size(); }
void Sync() override {
for (Runnable* l : queued_runnables_) {
LOG(INFO) << "running " << reinterpret_cast<void*>(l);
}
queued_runnables_.clear();
}
private:
std::vector<Runnable*> queued_runnables_;
};
Engine* GetEngine() {
std::call_once(init_engine, []() { engine = new DummyEngine(); });
return engine;
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <cstdint>
namespace paddle {
namespace imperative {
struct Runnable {};
class Engine {
public:
virtual ~Engine() {}
virtual void Enqueue(Runnable* runnable) = 0;
virtual size_t Size() const = 0;
virtual void Sync() = 0;
};
Engine* GetEngine();
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <limits>
#include <map>
#include <random>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace imperative {
using framework::Variable;
void AddTo(Variable* src, Variable* dst) {
framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
dst_tensor->numel(), src_tensor->numel());
float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
const float* src_data = src_tensor->data<float>();
for (size_t i = 0; i < src_tensor->numel(); ++i) {
dst_data[i] += src_data[i];
}
}
class Autograd {
public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {}
void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
std::deque<OpBase*> ready;
ready.push_back(var->pre_op_);
std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue;
OpBase* pre_op = ready_op->pre_ops_->at(i);
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
}
}
}
}
private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
std::map<OpBase*, int> ret;
std::deque<OpBase*> queue;
queue.push_back(op);
std::unordered_set<OpBase*> visited;
visited.insert(op);
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) {
if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
}
ret[pre_op] += 1;
}
}
return ret;
}
framework::Scope* scope_;
};
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = true) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>();
}
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
VLOG(3) << "apply var grad " << var_desc_->Name() << " "
<< grad->Get<framework::LoDTensor>().data<float>()[0];
if (!grads_) {
grads_ =
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
// grad op inputs can be forward inputs, so not in grad_to_var.
continue;
}
VLOG(3) << "op grad in var " << grad_invar;
block_->FindRecursiveOrCreateVar(grad_invar);
framework::Variable* var = scope->Var(grad_invar);
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
}
}
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
VLOG(3) << "grad outvar " << outvar;
block_->FindRecursiveOrCreateVar(outvar);
framework::Variable* var = scope->Var(outvar);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
opbase->Run(*scope, platform::CPUPlace());
// `ret` matches exactly with `input_vars_` of forward op.
std::vector<Variable*> ret;
for (size_t i = 0; i < input_vars_->size(); ++i) {
bool found = false;
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar);
VarBase* origin_var = (*input_vars_)[i];
std::string orig_var = grad_to_var_->at(outvar);
PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
}
if (!found) {
ret.push_back(nullptr);
}
}
return ret;
}
void VarBase::RunBackward(framework::Scope* scope) {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
if (!pre_op_) return;
Autograd(scope).RunBackward(this);
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
class OpBase;
class VarBase {
public:
VarBase()
: pre_op_(nullptr),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(nullptr),
grads_(nullptr) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
void RunBackward(framework::Scope* scope);
framework::LoDTensor& Grad();
OpBase* pre_op_;
int pre_op_out_idx_;
framework::VarDesc* var_desc_;
framework::Variable* var_;
framework::Variable* grads_;
};
class OpBase {
public:
OpBase()
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr) {}
virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
}
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
framework::BlockDesc* block_;
};
class Layer {
public:
virtual ~Layer() {}
virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
std::vector<VarBase> vars;
return vars;
}
virtual void Backward() { LOG(ERROR) << "To support customize"; }
};
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace imperative {} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
}
class Tracer {
public:
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_;
}
virtual ~Tracer() { delete root_scope_; }
void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
const std::vector<VarBase*>& outputs,
framework::BlockDesc* block) {
framework::Scope* scope = GetScope(block);
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs;
for (VarBase* input : inputs) {
const std::string vname = input->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
input->var_ = var;
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
if (input->pre_op_) {
op->pre_ops_->push_back(input->pre_op_);
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
}
}
*op->output_vars_ = outputs;
for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
}
op_base->Run(*scope, platform::CPUPlace());
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
op->block_ = block;
}
framework::Scope* GetScope(framework::BlockDesc* block) {
if (scopes_.find(block) != scopes_.end()) {
return scopes_.at(block);
}
framework::BlockDesc* parent_block = block->ParentBlock();
PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
framework::Scope* scope = &scopes_[parent_block]->NewScope();
scopes_[block] = scope;
return scope;
}
private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_;
framework::Scope* root_scope_;
};
} // namespace imperative
} // namespace paddle
...@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
*block_desc.Proto()->mutable_vars() = auto *vars = block_desc.Proto()->mutable_vars();
const_cast<framework::ProgramDesc *>(&graph->program()) for (framework::ir::Node *node : graph->Nodes()) {
->Proto() if (node->IsVar() && node->Var()) {
->blocks(0) *vars->Add() = *node->Var()->Proto();
.vars(); }
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
......
...@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) { ...@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// USE_OP(prelu); USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
nv_library(tensorrt_plugin nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
avg_pool_op_plugin.cu avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine) DEPS enforce tensorrt_engine prelu)
...@@ -14,92 +14,16 @@ ...@@ -14,92 +14,16 @@
#include <stdio.h> #include <stdio.h>
#include <cassert> #include <cassert>
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims, const nvinfer1::Dims *inputDims,
int nbInputs) { int nbInputs) {
...@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, ...@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
return output_dims; return output_dims;
} }
int PReluPlugin::enqueue(int batchSize, const void *const *inputs, int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) { void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW. // input dims is CHW.
const auto &input_dims = this->getInputDims(0); const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]); const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values); const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0]; float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape;
input_shape.push_back(batch_size);
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
if (mode_ == "channel") { if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream, input, alpha, output, input_shape);
} else if (mode_ == "element") { } else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_shape);
} else { } else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, input_shape);
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
......
...@@ -14,11 +14,13 @@ ...@@ -14,11 +14,13 @@
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <string> #include <string>
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
DEFINE_bool(init_allocated_mem, false, DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
...@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { ...@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag; static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr; static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() { std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount(); devices = platform::GetSelectedDevices();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, int gpu_num = devices.size();
gpu_num);
a_arr = new BuddyAllocator *[gpu_num]; a_arr = new BuddyAllocator *[gpu_num];
for (int i = 0; i < gpu_num; i++) { for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr; a_arr[i] = nullptr;
platform::SetDeviceId(i); platform::SetDeviceId(dev_id);
a_arr[i] = new BuddyAllocator( a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)), new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use " VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100 << FLAGS_fraction_of_gpu_memory_to_use * 100
...@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { ...@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
}); });
platform::SetDeviceId(gpu_id); platform::SetDeviceId(gpu_id);
return a_arr[gpu_id]; auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
} }
#endif #endif
......
...@@ -70,7 +70,7 @@ endif() ...@@ -70,7 +70,7 @@ endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU) if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif() endif()
# FIXME(typhoonzero): operator deps may not needed. # FIXME(typhoonzero): operator deps may not needed.
......
...@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
} }
#endif #endif
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()), framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
ctx.GetPlace(), layout, library); library);
} }
class ActivationOp : public framework::OperatorWithKernel { class ActivationOp : public framework::OperatorWithKernel {
......
...@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = { ...@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = {
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
}; };
/* The following operator can be used to process SelectedRows, because the
* output of those operator for zero is zero too.
*/
static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
"abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
...@@ -50,16 +56,38 @@ class ActivationKernel ...@@ -50,16 +56,38 @@ class ActivationKernel
using T = typename Functor::ELEMENT_TYPE; using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto& X = detail::Ref(context.Input<framework::Tensor>("X"), auto x_var = context.InputVar("X");
"Cannot get input tensor X, variable name = %s", auto out_var = context.OutputVar("Out");
PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input Variable X, variable name = %s",
context.op().Input("X")); context.op().Input("X"));
PADDLE_ENFORCE(out_var != nullptr,
"Cannot get output Variable Out, variable name = %s",
context.op().Output("Out"));
framework::Tensor X, *Out;
auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"), if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
out_var);
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = context.Output<framework::Tensor>("Out");
}
PADDLE_ENFORCE(Out != nullptr,
"Cannot get output tensor Out, variable name = %s", "Cannot get output tensor Out, variable name = %s",
context.op().Output("Out")); context.op().Output("Out"));
Out.mutable_data<T>(context.GetPlace());
Out->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(X); auto x = framework::EigenVector<T>::Flatten(X);
auto out = framework::EigenVector<T>::Flatten(Out); auto out = framework::EigenVector<T>::Flatten(*Out);
auto* place = auto* place =
context.template device_context<DeviceContext>().eigen_device(); context.template device_context<DeviceContext>().eigen_device();
Functor functor; Functor functor;
...@@ -78,14 +106,54 @@ class ActivationGradKernel ...@@ -78,14 +106,54 @@ class ActivationGradKernel
public: public:
using T = typename Functor::ELEMENT_TYPE; using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* Out = context.Input<framework::Tensor>("Out"); auto out_var = context.InputVar("Out");
auto* dOut = auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
context.Input<framework::Tensor>(framework::GradVarName("Out")); auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X")); PADDLE_ENFORCE(out_var != nullptr,
"Cannot get input Variable Out, variable name = %s",
context.op().Input("Out"));
PADDLE_ENFORCE(out_grad_var != nullptr,
"Cannot get input Variable %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
PADDLE_ENFORCE(x_grad_var != nullptr,
"Cannot get output Variable %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
framework::Tensor Out, dOut, *dX;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
Out = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut =
detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
*out_grad_var),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
x_grad_var);
} else {
Out = detail::Ref(context.Input<framework::Tensor>("Out"),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut = detail::Ref(
context.Input<framework::Tensor>(framework::GradVarName("Out")),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
}
PADDLE_ENFORCE(dX != nullptr,
"Cannot get output tensor %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto dout = framework::EigenVector<T>::Flatten(*dOut); auto dout = framework::EigenVector<T>::Flatten(dOut);
auto out = framework::EigenVector<T>::Flatten(*Out); auto out = framework::EigenVector<T>::Flatten(Out);
auto dx = framework::EigenVector<T>::Flatten(*dX); auto dx = framework::EigenVector<T>::Flatten(*dX);
auto* place = auto* place =
context.template device_context<DeviceContext>().eigen_device(); context.template device_context<DeviceContext>().eigen_device();
...@@ -96,8 +164,19 @@ class ActivationGradKernel ...@@ -96,8 +164,19 @@ class ActivationGradKernel
} }
bool inplace = functor.Inplace(); bool inplace = functor.Inplace();
if (!inplace) { if (!inplace) {
auto* X = context.Input<framework::Tensor>("X"); auto x_var = context.InputVar("X");
auto x = framework::EigenVector<T>::Flatten(*X); PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
framework::Tensor X;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"));
}
auto x = framework::EigenVector<T>::Flatten(X);
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} else { } else {
VLOG(10) << " Inplace activation "; VLOG(10) << " Inplace activation ";
......
...@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. ...@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM.
template <typename T> template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) { inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) { if (bias) {
math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y); math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
math::vec_relu<T, platform::jit::avx>(n, y, y); math::vec_relu<T, platform::avx>(n, y, y);
} else { } else {
math::vec_relu<T, platform::jit::avx>(n, x, y); math::vec_relu<T, platform::avx>(n, x, y);
} }
} }
...@@ -245,7 +245,7 @@ inline void vec_softmax(const int n, const T* x, T* y) { ...@@ -245,7 +245,7 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar; scalar = scalar < x[i] ? x[i] : scalar;
} }
math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y); // sub math::vec_add_bias<T, platform::avx>(n, -scalar, x, y); // sub
math::vec_exp<T>(n, y, y); // exp math::vec_exp<T>(n, y, y); // exp
// sum // sum
scalar = T(0); scalar = T(0);
...@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> { ...@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor; math::VecActivations<T, platform::avx> act_functor;
act_gate = act_functor(act_gate_str); act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str); act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str); act_cand = act_functor(act_cand_str);
} else { } else {
math::VecActivations<T, platform::jit::isa_any> act_functor; math::VecActivations<T, platform::isa_any> act_functor;
act_gate = act_functor(act_gate_str); act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str); act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str); act_cand = act_functor(act_cand_str);
......
...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") { if (!exhaustive_search) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
...@@ -165,6 +161,29 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -165,6 +161,29 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
if ((activation == "identity") &&
(algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
(!residual)) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
// But test in some case, the speed is slower, change to use
// cudnnConvolutionForward and cudnnAddTensor
// ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
} else {
if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward -------------------- // ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f; ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
...@@ -178,6 +197,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -178,6 +197,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}; };
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} }
}
}; };
#endif #endif
......
...@@ -491,8 +491,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -491,8 +491,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float>); ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNGradOpKernel<float>); ops::ConvMKLDNNGradOpKernel<float>);
...@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
} }
#endif #endif
...@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library); library, customized_type_value);
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
...@@ -342,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -342,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -357,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -357,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
} }
#endif #endif
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_); layout_, library_, customized_type_value);
} }
} // namespace operators } // namespace operators
......
...@@ -27,6 +27,8 @@ namespace paddle { ...@@ -27,6 +27,8 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
// Base convolution operator definations for other conv // Base convolution operator definations for other conv
// like operators to reuse the implementation. // like operators to reuse the implementation.
......
...@@ -177,11 +177,19 @@ struct CudnnRNNCache { ...@@ -177,11 +177,19 @@ struct CudnnRNNCache {
seed_)); seed_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
#if CUDNN_VERSION >= 6000
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CUDNN_LINEAR_INPUT, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
#else
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_DATA_FLOAT));
#endif
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
......
...@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T> ...@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> { class ElementwiseMulKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X"); auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input Variable X, variable name = %s",
ctx.op().Input("X"));
auto* y = ctx.Input<framework::LoDTensor>("Y"); auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
framework::Tensor x, *z;
if (x_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1,
"For elementwise_op, if X is Sparse, Y must be scalar.");
auto& x_sele = x_var->Get<framework::SelectedRows>();
auto out_sele = ctx.Output<framework::SelectedRows>("Out");
x = x_sele.value();
out_sele->set_rows(x_sele.rows());
out_sele->set_height(x_sele.height());
out_sele->mutable_value()->Resize(x_sele.value().dims());
out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
} else if (x_var->IsType<framework::LoDTensor>()) {
x = x_var->Get<framework::LoDTensor>();
z = ctx.Output<framework::LoDTensor>("Out");
} else {
PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
x_var->Type().name());
}
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
if (x->numel() == y->numel()) { if (x.numel() == y->numel()) {
elementwise_mul<DeviceContext, T>(ctx, x, y, z); elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
} else { } else {
default_elementwise_mul<DeviceContext, T>(ctx, x, y, z); default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
} }
} }
}; };
......
...@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of elementwise op should not be null."); "Output(Out) of elementwise op should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->GetInputsVarType("Y").front() == ctx->GetInputsVarType("Y").front() ==
framework::proto::VarType::LOD_TENSOR, framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s", "The input var's type should be LoDTensor, but the received is %s [%s]",
ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front()); ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
if (ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::LOD_TENSOR) {
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y"); auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input."); "Rank of first input must >= rank of second input.");
} else if (ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
(ctx->GetInputDim("Y")[0] == 1),
"For elementwise_op, if X is Sparse, "
"Y must be scalar.");
} else {
PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
ctx->GetInputsVarType("X").front());
}
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
......
...@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \ auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \ auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \ auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \ if (platform::MayIUse(platform::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \ math::VecActivations<T, platform::avx> act_functor; \
act_gate = act_functor(act_gate_str); \ act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \ act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \ act_cand = act_functor(act_cand_str); \
} else { \ } else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \ math::VecActivations<T, platform::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \ act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \ act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \ act_cand = act_functor(act_cand_str); \
......
...@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> { ...@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
std::function<void(const int, const T*, T*)> fc_act; std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation"); auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor; math::VecActivations<T, platform::avx> act_functor;
fc_act = act_functor(fc_act_str); fc_act = act_functor(fc_act_str);
} else { } else {
math::VecActivations<T, platform::jit::isa_any> act_functor; math::VecActivations<T, platform::isa_any> act_functor;
fc_act = act_functor(fc_act_str); fc_act = act_functor(fc_act_str);
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"GetTensorFromSelectedRowsOp must has input X.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"GetTensorFromSelectedRowsOp must has output Out.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::SELECTED_ROWS,
"The input X's type should be SelectedRows, but the received is %s",
ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
PADDLE_ENFORCE(
ctx->GetOutputsVarType("Out").front() ==
framework::proto::VarType::LOD_TENSOR,
"The output Out's type should be LoDTensor, but the received is %s",
ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front());
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
}
};
class GetTensorFromSelectedRowsKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.Input<framework::SelectedRows>("X");
auto *out = ctx.Output<framework::LoDTensor>("Out");
out->Resize(x->value().dims());
out->mutable_data(ctx.GetPlace(), x->value().type());
framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(),
out);
}
};
class GetTensorFromSelectedRowsOpProtoMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input type is SelectedRows.");
AddOutput("Out", "The output type is LoDTensor.");
AddComment(
R"DOC(
GetTensorFromSelectedRows Operator
GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
)DOC");
}
};
class GetTensorFromSelectedRowsOpVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const final {
auto out_var_name = op_desc.Output("Out").front();
auto in_var_name = op_desc.Input("X").front();
auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
out_var.SetType(framework::proto::VarType::LOD_TENSOR);
out_var.SetDataType(in_var.GetDataType());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(get_tensor_from_selected_rows,
ops::GetTensorFromSelectedRowsOp,
ops::GetTensorFromSelectedRowsOpProtoMaker,
ops::GetTensorFromSelectedRowsOpVarTypeInference);
REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
ops::GetTensorFromSelectedRowsKernel, double,
ops::GetTensorFromSelectedRowsKernel, int,
ops::GetTensorFromSelectedRowsKernel, int64_t,
ops::GetTensorFromSelectedRowsKernel);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
ops::GetTensorFromSelectedRowsKernel, double,
ops::GetTensorFromSelectedRowsKernel, int,
ops::GetTensorFromSelectedRowsKernel, int64_t,
ops::GetTensorFromSelectedRowsKernel);
#endif
...@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ...@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
"Output(W@Grad should not be null."); "Output(W@Grad should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad should not be null."); "Output(X@Grad should not be null.");
if (!ctx->Attrs().Get<bool>("is_sparse")) {
if (ctx->HasOutput(framework::GradVarName("Bias"))) { if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias")); ctx->GetInputDim("Bias"));
} }
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
}
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
protected: protected:
......
...@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
ctx.Output<framework::SelectedRows>(framework::GradVarName("W")); ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
w_grad->set_rows(real_rows); w_grad->set_rows(real_rows);
// Build a map of id -> row_index to speed up finding the index of one id // Build a map of id -> row_index to speed up finding the index of one id
w_grad->SyncIndex();
w_grad->set_height(w.dims()[0]); w_grad->set_height(w.dims()[0]);
auto* w_grad_value = w_grad->mutable_value(); auto* w_grad_value = w_grad->mutable_value();
framework::DDim temp_dim(w.dims()); framework::DDim temp_dim(w.dims());
......
...@@ -59,6 +59,7 @@ math_library(matrix_bit_code) ...@@ -59,6 +59,7 @@ math_library(matrix_bit_code)
math_library(unpooling) math_library(unpooling)
math_library(vol2col) math_library(vol2col)
math_library(prelu)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
......
...@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) { ...@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) {
#endif #endif
// MKL scal only support inplace, choose this if src and dst are not equal // MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) { inline void vec_scal(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = a * x[i]; y[i] = a * x[i];
...@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { ...@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx>(const int n, const float a, inline void vec_scal<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_scal<float, platform::jit::isa_any>(n, a, x, y); vec_scal<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a, ...@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
y[i] = a * x[i]; y[i] = a * x[i];
} }
#else #else
vec_scal<float, platform::jit::isa_any>(n, a, x, y); vec_scal<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx2>(const int n, const float a, inline void vec_scal<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_scal<float, platform::jit::avx>(n, a, x, y); vec_scal<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a, inline void vec_scal<float, platform::avx512f>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_scal<float, platform::jit::avx2>(n, a, x, y); vec_scal<float, platform::avx2>(n, a, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = a - x[i]; y[i] = a - x[i];
...@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { ...@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a, inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y); vec_bias_sub<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a, ...@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
y[i] = a - x[i]; y[i] = a - x[i];
} }
#else #else
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y); vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a, inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_bias_sub<float, platform::jit::avx>(n, a, x, y); vec_bias_sub<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx512f>(const int n, inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
const float a, const float* x, float* y) {
const float* x,
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y); vec_bias_sub<float, platform::avx2>(n, a, x, y);
} }
// out = x*y + (1-x)*z // out = x*y + (1-x)*z
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i]; out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
...@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { ...@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx>(const int n, const float* x, inline void vec_cross<float, platform::avx>(const int n, const float* x,
const float* y, const float* z, const float* y, const float* z,
float* out) { float* out) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out); vec_cross<float, platform::isa_any>(n, x, y, z, out);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x, ...@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
} }
#else #else
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out); vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif #endif
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x, inline void vec_cross<float, platform::avx2>(const int n, const float* x,
const float* y, const float* y, const float* z,
const float* z, float* out) { float* out) {
vec_cross<float, platform::jit::avx>(n, x, y, z, out); vec_cross<float, platform::avx>(n, x, y, z, out);
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx512f>( inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
const int n, const float* x, const float* y, const float* z, float* out) { const float* y, const float* z,
float* out) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out); vec_cross<float, platform::avx>(n, x, y, z, out);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) { inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = x[i] + a; y[i] = x[i] + a;
...@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { ...@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a, inline void vec_add_bias<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_add_bias<float, platform::jit::isa_any>(n, a, x, y); vec_add_bias<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a, ...@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
y[i] = x[i] + a; y[i] = x[i] + a;
} }
#else #else
vec_add_bias<float, platform::jit::isa_any>(n, a, x, y); vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a, inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_add_bias<float, platform::jit::avx>(n, a, x, y); vec_add_bias<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx512f>(const int n, inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
const float a, const float* x, float* y) {
const float* x,
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_add_bias<float, platform::jit::avx2>(n, a, x, y); vec_add_bias<float, platform::avx2>(n, a, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) { inline void vec_identity(const int n, const T* x, T* y) {
// do nothing // do nothing
return; return;
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) { inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN; const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX; const T max = SIGMOID_THRESHOLD_MAX;
...@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { ...@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x, inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_sigmoid<float, platform::jit::isa_any>(n, x, y); vec_sigmoid<float, platform::isa_any>(n, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x, ...@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
y[i] = 1.f / (1.f + y[i]); y[i] = 1.f / (1.f + y[i]);
} }
#else #else
vec_sigmoid<float, platform::jit::isa_any>(n, x, y); vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif #endif
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x, inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
float* y) { float* y) {
vec_sigmoid<float, platform::jit::avx>(n, x, y); vec_sigmoid<float, platform::avx>(n, x, y);
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx512f>(const int n, inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
const float* x,
float* y) { float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_sigmoid<float, platform::jit::avx2>(n, x, y); vec_sigmoid<float, platform::avx2>(n, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) { inline void vec_tanh(const int n, const T* x, T* y) {
vec_scal<T, isa>(n, static_cast<T>(2), x, y); vec_scal<T, isa>(n, static_cast<T>(2), x, y);
vec_sigmoid<T, isa>(n, y, y); vec_sigmoid<T, isa>(n, y, y);
...@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { ...@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
} }
// TODO(TJ): make relu clip // TODO(TJ): make relu clip
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) { inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0; y[i] = x[i] > 0 ? x[i] : 0;
...@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { ...@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) {
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x, inline void vec_relu<float, platform::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) { if (n < block * 4) {
vec_relu<float, platform::jit::isa_any>(n, x, y); vec_relu<float, platform::isa_any>(n, x, y);
return; return;
} }
...@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x, ...@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
#undef MOVE_ONE_STEP #undef MOVE_ONE_STEP
#else #else
vec_relu<float, platform::jit::isa_any>(n, x, y); vec_relu<float, platform::isa_any>(n, x, y);
#endif #endif
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x, inline void vec_relu<float, platform::avx2>(const int n, const float* x,
float* y) { float* y) {
vec_relu<float, platform::jit::avx>(n, x, y); vec_relu<float, platform::avx>(n, x, y);
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x, inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
float* y) { float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_relu<float, platform::jit::avx2>(n, x, y); vec_relu<float, platform::avx2>(n, x, y);
} }
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations { class VecActivations {
public: public:
std::function<void(const int, const T*, T*)> operator()( std::function<void(const int, const T*, T*)> operator()(
......
...@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt, ...@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
} }
TEST(CpuVecTest, sigmoid) { TEST(CpuVecTest, sigmoid) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>, TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
} }
TEST(CpuVecTest, tanh) { TEST(CpuVecTest, tanh) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
ref_tanh<float>);
} }
TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>); TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
TEST(CpuVecTest, relu) { TEST(CpuVecTest, relu) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
ref_relu<float>);
} }
TestAndBench<double>(30, vec_relu<double>, ref_relu<double>); TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
} }
...@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt, ...@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
} }
TEST(CpuVecTest, inplace_sigmoid) { TEST(CpuVecTest, inplace_sigmoid) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>, TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
} }
TEST(CpuVecTest, inplace_tanh) { TEST(CpuVecTest, inplace_tanh) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
} }
TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>); TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
TEST(CpuVecTest, inplace_relu) { TEST(CpuVecTest, inplace_relu) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_relu<float>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
} }
TestInplace<double>(30, vec_relu<double>, ref_relu<double>); TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
} }
...@@ -22,7 +22,7 @@ namespace math { ...@@ -22,7 +22,7 @@ namespace math {
namespace jitkernel { namespace jitkernel {
namespace gen { namespace gen {
using namespace platform::jit; // NOLINT using namespace platform; // NOLINT
bool VXXJitCode::init(int d, int scalar_index) { bool VXXJitCode::init(int d, int scalar_index) {
// It's not necessary to use avx512 since it would slow down the frequency // It's not necessary to use avx512 since it would slow down the frequency
......
...@@ -179,7 +179,7 @@ class VActJitCode : public JitCode { ...@@ -179,7 +179,7 @@ class VActJitCode : public JitCode {
template <typename JMM> template <typename JMM>
void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT
int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
using namespace platform::jit; // NOLINT using namespace platform; // NOLINT
// check all idx can not equal // check all idx can not equal
JMM jmm_src = JMM(src_idx); JMM jmm_src = JMM(src_idx);
JMM jmm_fx = JMM(fx_idx); JMM jmm_fx = JMM(fx_idx);
......
...@@ -36,7 +36,7 @@ void JitCode::preCode() { ...@@ -36,7 +36,7 @@ void JitCode::preCode() {
for (int i = 0; i < num_g_abi_regs; ++i) { for (int i = 0; i < num_g_abi_regs; ++i) {
push(Xbyak::Reg64(g_abi_regs[i])); push(Xbyak::Reg64(g_abi_regs[i]));
} }
if (platform::jit::MayIUse(platform::jit::avx512f)) { if (platform::MayIUse(platform::avx512f)) {
mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
} }
} }
......
...@@ -21,8 +21,6 @@ namespace operators { ...@@ -21,8 +21,6 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
KernelPool& KernelPool::Instance() { KernelPool& KernelPool::Instance() {
static thread_local KernelPool g_jit_kernels; static thread_local KernelPool g_jit_kernels;
return g_jit_kernels; return g_jit_kernels;
......
...@@ -30,7 +30,6 @@ namespace paddle { ...@@ -30,7 +30,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <typename T> template <typename T>
...@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) { ...@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <> template <>
bool VMulKernelImpl<float>::useMKL(int d) { bool VMulKernelImpl<float>::useMKL(int d) {
return jit::MayIUse(jit::avx512f) && d > 512; return platform::MayIUse(platform::avx512f) && d > 512;
} }
template <> template <>
......
...@@ -25,10 +25,8 @@ namespace operators { ...@@ -25,10 +25,8 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
/* CRF Decode JitKernel */ /* CRF Decode JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::cpu_isa_t isa, jit_block>
class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
public: public:
explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() { explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
...@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
#define INTRIAVX_FLOAT(block) \ #define INTRIAVX_FLOAT(block) \
template <> \ template <> \
CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl( \ CRFDecodeKernelImpl<float, platform::avx, block>::CRFDecodeKernelImpl( \
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
...@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx, block>::Compute( \ void CRFDecodeKernelImpl<float, platform::avx, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(YMM_FLOAT_BLOCK) \ INIT_ALPHA(YMM_FLOAT_BLOCK) \
...@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
#define INTRIAVX512_FLOAT(block) \ #define INTRIAVX512_FLOAT(block) \
template <> \ template <> \
CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl( \ CRFDecodeKernelImpl<float, platform::avx512f, block>::CRFDecodeKernelImpl( \
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
...@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute( \ void CRFDecodeKernelImpl<float, platform::avx512f, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(ZMM_FLOAT_BLOCK) \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \
...@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); ...@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16);
INTRIAVX_FLOAT(kGT16); INTRIAVX_FLOAT(kGT16);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRIAVX2_FLOAT(jit::avx2, kEQ8); INTRIAVX2_FLOAT(platform::avx2, kEQ8);
INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); INTRIAVX2_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX2_FLOAT(jit::avx2, kEQ16); INTRIAVX2_FLOAT(platform::avx2, kEQ16);
INTRIAVX2_FLOAT(jit::avx2, kGT16); INTRIAVX2_FLOAT(platform::avx2, kGT16);
#endif #endif
#ifdef __AVX512F__ #ifdef __AVX512F__
INTRIAVX2_FLOAT(jit::avx512f, kEQ8); INTRIAVX2_FLOAT(platform::avx512f, kEQ8);
INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16);
INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kEQ16);
INTRIAVX512_FLOAT(kGT16); INTRIAVX512_FLOAT(kGT16);
#endif #endif
......
...@@ -29,7 +29,6 @@ namespace paddle { ...@@ -29,7 +29,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
// try to use MKL to speedup // try to use MKL to speedup
......
...@@ -22,10 +22,8 @@ namespace operators { ...@@ -22,10 +22,8 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
/* Layer Norm JitKernel */ /* Layer Norm JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::cpu_isa_t isa, jit_block>
class LayerNormKernelImpl : public LayerNormKernel<T> { class LayerNormKernelImpl : public LayerNormKernel<T> {
public: public:
explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() { explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
...@@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> { ...@@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
this->end_ = this->num_ - this->rest_; \ this->end_ = this->num_ - this->rest_; \
} \ } \
template <> \ template <> \
void LayerNormKernelImpl<float, jit::avx, block>::Compute( \ void LayerNormKernelImpl<float, platform::avx, block>::Compute( \
float* x, float* out, float* mean, float* var, const float* scale, \ float* x, float* out, float* mean, float* var, const float* scale, \
const float* bias, int height, const float epsilon) const { \ const float* bias, int height, const float epsilon) const { \
__m256 sum; \ __m256 sum; \
...@@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> { ...@@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
} }
#ifdef __AVX__ #ifdef __AVX__
INTRIAVX_FLOAT(jit::avx, kEQ8); INTRIAVX_FLOAT(platform::avx, kEQ8);
INTRIAVX_FLOAT(jit::avx, kGT8LT16); INTRIAVX_FLOAT(platform::avx, kGT8LT16);
INTRIAVX_FLOAT(jit::avx, kEQ16); INTRIAVX_FLOAT(platform::avx, kEQ16);
INTRIAVX_FLOAT(jit::avx, kGT16); INTRIAVX_FLOAT(platform::avx, kGT16);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRIAVX_FLOAT(jit::avx2, kEQ8); INTRIAVX_FLOAT(platform::avx2, kEQ8);
INTRIAVX_FLOAT(jit::avx2, kGT8LT16); INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX_FLOAT(jit::avx2, kEQ16); INTRIAVX_FLOAT(platform::avx2, kEQ16);
INTRIAVX_FLOAT(jit::avx2, kGT16); INTRIAVX_FLOAT(platform::avx2, kGT16);
#endif #endif
#undef INTRIAVX_FLOAT #undef INTRIAVX_FLOAT
......
...@@ -92,7 +92,6 @@ namespace jitkernel { ...@@ -92,7 +92,6 @@ namespace jitkernel {
JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \
JITKERNEL_IMPL) JITKERNEL_IMPL)
namespace jit = platform::jit;
// TODO(TJ): below defines are deprecated, would be remove recently // TODO(TJ): below defines are deprecated, would be remove recently
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ #define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < YMM_FLOAT_BLOCK) { \ if (d < YMM_FLOAT_BLOCK) { \
...@@ -108,14 +107,14 @@ namespace jit = platform::jit; ...@@ -108,14 +107,14 @@ namespace jit = platform::jit;
} }
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ #define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \ if (platform::MayIUse(platform::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \ } else if (platform::MayIUse(platform::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \
} else if (jit::MayIUse(jit::avx)) { \ } else if (platform::MayIUse(platform::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \
} else { \ } else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
} }
#define JITKERNEL_KEY(ker_key, dtype_key) \ #define JITKERNEL_KEY(ker_key, dtype_key) \
...@@ -156,10 +155,10 @@ namespace jit = platform::jit; ...@@ -156,10 +155,10 @@ namespace jit = platform::jit;
marco_declare, macro_key, macro_impl) marco_declare, macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \ #define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \ macro_(platform::avx512f, block); \
macro_(jit::avx2, block); \ macro_(platform::avx2, block); \
macro_(jit::avx, block); \ macro_(platform::avx, block); \
macro_(jit::isa_any, block) macro_(platform::isa_any, block)
#define FOR_EACH_BLOCK(macro_, isa) \ #define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \ macro_(isa, kLT8); \
...@@ -169,10 +168,10 @@ namespace jit = platform::jit; ...@@ -169,10 +168,10 @@ namespace jit = platform::jit;
macro_(isa, kGT16) macro_(isa, kGT16)
#define FOR_EACH_ISA_BLOCK(macro_) \ #define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \ FOR_EACH_BLOCK(macro_, platform::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \ FOR_EACH_BLOCK(macro_, platform::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \ FOR_EACH_BLOCK(macro_, platform::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any) FOR_EACH_BLOCK(macro_, platform::isa_any)
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -705,7 +705,7 @@ TEST(JitKernel, pool) { ...@@ -705,7 +705,7 @@ TEST(JitKernel, pool) {
jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
// empty call it to avoid unknown flag 'use_pinned_memory' on Mac // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
paddle::platform::jit::MayIUse(paddle::platform::jit::avx); paddle::platform::MayIUse(paddle::platform::avx);
const auto& plstm1 = const auto& plstm1 =
jit::KernelPool::Instance() jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr); .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
......
...@@ -89,6 +89,8 @@ template <typename T> ...@@ -89,6 +89,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
const framework::Tensor& weight, const framework::Tensor& weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat->dims()[0]; size_t num_samples = tmat->dims()[0];
size_t tmat_width = tmat->dims()[1]; size_t tmat_width = tmat->dims()[1];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
...@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, ...@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_row = input_value + input_width * i;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); size_t index = code->calc_index(j);
const T* weight_row = weight_value + weight_width * index;
T sum = static_cast<T>(0.0); T sum = static_cast<T>(0.0);
for (size_t k = 0; k < input_width; ++k) { sum = blas.DOT(input_width, weight_row, input_row);
sum += weight_value[weight_width * index + k] *
input_value[input_width * i + k];
}
tmat_value[i * tmat_width + j] += sum; tmat_value[i * tmat_width + j] += sum;
} }
} }
...@@ -115,6 +116,8 @@ template <typename T> ...@@ -115,6 +116,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::Tensor* weight, framework::Tensor* weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
...@@ -122,17 +125,26 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -122,17 +125,26 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto tmat_value = tmat.data<T>(); auto tmat_value = tmat.data<T>();
auto weight_value = weight->data<T>(); auto weight_value = weight->data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_value_row = input_value + input_width * i;
const T* tmat_row = tmat_value + i * tmat_width;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
for (size_t k = 0; k < input_width; ++k) {
weight_value[weight_width * index + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
} }
} }
for (auto& op : ops) {
auto& op_in_row = op.second;
for (auto& pair : op_in_row) {
auto& scale = pair.first;
auto* input_row = pair.second;
T* weight_row = weight_value + op.first * weight_width;
blas.AXPY(input_width, scale, input_row, weight_row);
}
} }
} }
...@@ -140,6 +152,8 @@ template <typename T> ...@@ -140,6 +152,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::SelectedRows* weight, framework::SelectedRows* weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
...@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto tmat_value = tmat.data<T>(); auto tmat_value = tmat.data<T>();
auto weight_value = weight->mutable_value()->data<T>(); auto weight_value = weight->mutable_value()->data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
ops.reserve(weight->rows().size());
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_value_row = input_value + input_width * i;
const T* tmat_row = tmat_value + i * tmat_width;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
for (size_t k = 0; k < input_width; ++k) { }
int64_t row_index = weight->GetIndexFromId(static_cast<int64_t>(index));
weight_value[row_index * weight_width + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
} }
for (auto& row : weight->rows()) {
auto& op_in_row = ops[row];
for (auto& pair : op_in_row) {
auto& scale = pair.first;
auto* input_row = pair.second;
blas.AXPY(input_width, scale, input_row, weight_value);
} }
weight_value += weight_width;
} }
} }
......
...@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32) #if defined(_WIN32)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
namespace operators {
namespace math {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template <typename T>
__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
T *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
T *out = output + offset;
T scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
template <typename T>
__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
T *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
const T *scale = alpha + offset;
T *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
template <typename T>
__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
T scale = *alpha;
T *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
template <typename T>
static inline void PReluChannelWise(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, input_shape[1], spatial_size);
}
template <typename T>
static inline void PReluElementWise(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
static inline void PReluScalar(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, input_shape[1], spatial_size);
}
template <typename T>
void PreluElementWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
const T *input, const T *alpha,
T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<double>;
template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<double>;
template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
namespace math {
#ifdef PADDLE_WITH_CUDA
template <typename T>
class PreluChannelWiseDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
template <typename T>
class PreluElementWiseDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
template <typename T>
class PreluScalarDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
#endif
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/merge_selected_rows_op.h"
namespace paddle {
namespace operators {
class MergeSelectedRowsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MergeSelectedRowsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MergeSelectedRowsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
}
};
class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input type is SelectedRows, and the selected rows may be "
"duplicated.");
AddOutput("Out",
"The output type is SelectedRows, and the selected rows are not "
"duplicated.");
AddComment(
R"DOC(
MergeSelectedRows Operator.
MergeSelectedRows is used to merge the duplicated rows of the input.
)DOC");
}
};
class MergeSelectedRowsOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp,
ops::MergeSelectedRowsOpMaker,
ops::MergeSelectedRowsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
merge_selected_rows,
ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, float>,
ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/merge_selected_rows_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
merge_selected_rows,
ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, float>,
ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MergeSelectedRowsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::SelectedRows>("X");
auto* out = context.Output<framework::SelectedRows>("Out");
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(context.template device_context<DeviceContext>(), *x, out);
}
};
} // namespace operators
} // namespace paddle
...@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel { ...@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/prelu.h"
#include "paddle/fluid/operators/prelu_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class CUDAPReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* alpha = context.Input<Tensor>("Alpha");
auto* out = context.Output<Tensor>("Out");
const T* x_ptr = x->data<T>();
T* o_ptr = out->mutable_data<T>(context.GetPlace());
const T* alpha_ptr = alpha->data<T>();
auto& mode = context.Attr<std::string>("mode");
int numel = x->numel();
auto dim = x->dims();
std::vector<int> input_shape = framework::vectorize2int(dim);
if (mode == "channel") {
math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
alpha_ptr, o_ptr, input_shape);
} else if (mode == "element") {
math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
alpha_ptr, o_ptr, input_shape);
} else {
math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
o_ptr, input_shape);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { ...@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256; return CUDAPinnedMaxAllocSize() / 256;
} }
namespace jit {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu; static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) { bool MayIUse(const cpu_isa_t cpu_isa) {
...@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { ...@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
} }
#endif #endif
} // namespace jit
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); ...@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator. //! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize(); size_t CUDAPinnedMaxChunkSize();
namespace jit {
typedef enum { typedef enum {
isa_any, isa_any,
sse42, sse42,
...@@ -55,7 +54,5 @@ typedef enum { ...@@ -55,7 +54,5 @@ typedef enum {
// May I use some instruction // May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa); bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace jit
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
auto *kernel = auto *kernel =
reinterpret_cast<const CUpti_ActivityKernel3 *>(record); reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
tracer->AddKernelRecords(kernel->start, kernel->end, tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
kernel->deviceId, kernel->streamId, kernel->deviceId, kernel->streamId,
kernel->correlationId); kernel->correlationId);
break; break;
...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) { int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced"; VLOG(3) << correlation_id << " cannot be traced";
...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back( kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id}); KernelRecord{name, start, end, device_id, stream_id, correlation_id});
} }
bool IsEnabled() { bool IsEnabled() {
...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer {
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
continue;
}
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) {
event->set_name(correlations_.at(r.correlation_id)); event->set_name(correlations_.at(r.correlation_id));
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
......
...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { ...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() {
class DeviceTracer { class DeviceTracer {
public: public:
struct KernelRecord { struct KernelRecord {
std::string name;
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
int64_t device_id; int64_t device_id;
...@@ -84,8 +85,9 @@ class DeviceTracer { ...@@ -84,8 +85,9 @@ class DeviceTracer {
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) = 0; int64_t device_id, int64_t stream_id,
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
......
...@@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \ __macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \ __macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \ __macro(cudnnDestroyRNNDescriptor);
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...@@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
// APIs in R6
#if CUDNN_VERSION >= 6000
#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \ __macro(cudnnSetConvolutionGroupCount); \
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#ifndef _WIN32 #ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
...@@ -45,6 +46,15 @@ DEFINE_bool( ...@@ -45,6 +46,15 @@ DEFINE_bool(
"input and output must be half precision) and recurrent neural networks " "input and output must be half precision) and recurrent neural networks "
"(RNNs)."); "(RNNs).");
DEFINE_string(selected_gpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (GPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
"share-memory only.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -121,6 +131,24 @@ int GetCurrentDeviceId() { ...@@ -121,6 +131,24 @@ int GetCurrentDeviceId() {
return device_id; return device_id;
} }
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stddef.h> #include <stddef.h>
#include <string> #include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i); ...@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i);
//! Get the current GPU device id in system. //! Get the current GPU device id in system.
int GetCurrentDeviceId(); int GetCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
//! Set the GPU device id for next execution. //! Set the GPU device id for next execution.
void SetDeviceId(int device_id); void SetDeviceId(int device_id);
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) { ...@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) {
std::vector<int> devices; std::vector<int> devices;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
try { try {
int count = platform::GetCUDADeviceCount(); // use user specified GPUs in single-node multi-process mode.
for (int i = 0; i < count; ++i) { devices = platform::GetSelectedDevices();
devices.push_back(i);
}
} catch (const std::exception &exp) { } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
...@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) { ...@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) {
void InitDevices(bool init_p2p, const std::vector<int> devices) { void InitDevices(bool init_p2p, const std::vector<int> devices) {
std::vector<platform::Place> places; std::vector<platform::Place> places;
int count = 0;
#ifdef PADDLE_WITH_CUDA
try {
count = platform::GetCUDADeviceCount();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#endif
for (size_t i = 0; i < devices.size(); ++i) { for (size_t i = 0; i < devices.size(); ++i) {
if (devices[i] >= count || devices[i] < 0) { // In multi process multi gpu mode, we may have gpuid = 7
// but count = 1.
if (devices[i] < 0) {
LOG(WARNING) << "Invalid devices id."; LOG(WARNING) << "Invalid devices id.";
continue; continue;
} }
places.emplace_back(platform::CUDAPlace(devices[i])); places.emplace_back(platform::CUDAPlace(devices[i]));
} }
if (init_p2p) { if (init_p2p) {
...@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
#ifndef __AVX__ #ifndef __AVX__
LOG(WARNING) << "AVX is available, Please re-compile on local machine"; LOG(WARNING) << "AVX is available, Please re-compile on local machine";
#endif #endif
...@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
" version or compile from source code." " version or compile from source code."
#ifdef __AVX512F__ #ifdef __AVX512F__
if (!platform::jit::MayIUse(platform::jit::avx512f)) { if (!platform::MayIUse(platform::avx512f)) {
if (platform::jit::MayIUse(platform::jit::avx2)) { if (platform::MayIUse(platform::avx2)) {
AVX_GUIDE(AVX512, AVX2); AVX_GUIDE(AVX512, AVX2);
} else if (platform::jit::MayIUse(platform::jit::avx)) { } else if (platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX512, AVX); AVX_GUIDE(AVX512, AVX);
} else { } else {
AVX_GUIDE(AVX512, NonAVX); AVX_GUIDE(AVX512, NonAVX);
...@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
if (!platform::jit::MayIUse(platform::jit::avx2)) { if (!platform::MayIUse(platform::avx2)) {
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX2, AVX); AVX_GUIDE(AVX2, AVX);
} else { } else {
AVX_GUIDE(AVX2, NonAVX); AVX_GUIDE(AVX2, NonAVX);
...@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#ifdef __AVX__ #ifdef __AVX__
if (!platform::jit::MayIUse(platform::jit::avx)) { if (!platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX, NonAVX); AVX_GUIDE(AVX, NonAVX);
} }
#endif #endif
......
...@@ -97,7 +97,7 @@ struct NCCLContextMap { ...@@ -97,7 +97,7 @@ struct NCCLContextMap {
order_.size(), contexts_.size(), order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
if (places.size() <= 1) { if (places.size() <= 1 && num_trainers == 1) {
return; return;
} }
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]); std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
...@@ -111,12 +111,19 @@ struct NCCLContextMap { ...@@ -111,12 +111,19 @@ struct NCCLContextMap {
{ {
int nranks = num_trainers * order_.size(); int nranks = num_trainers * order_.size();
NCCLGroupGuard gurad; NCCLGroupGuard gurad;
for (auto &gpu_id : order_) { for (size_t i = 0; i < order_.size(); ++i) {
int rank = trainer_id * order_.size() + gpu_id; int gpu_id = order_[i];
VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; int rank;
if (order_.size() > 1) {
rank = trainer_id * order_.size() + i;
} else {
rank = trainer_id;
}
VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
<< "gpu id: " << gpu_id;
PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
comms.get() + gpu_id, nranks, *nccl_id, rank)); comms.get() + i, nranks, *nccl_id, rank));
} }
} }
} }
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
if(WITH_PYTHON) if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace pybind {
// Bind Methods
void BindTracer(pybind11::module *m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
.def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
}
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace paddle {
namespace pybind {
class PyLayer : public imperative::Layer {
public:
using imperative::Layer::Layer; // Inherit constructors
std::vector<imperative::VarBase> Forward(
const std::vector<imperative::VarBase>& inputs) override {
PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
inputs); // NOLINT
}
void Backward() override {
PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT
}
};
class PyOpBase : public imperative::OpBase {
public:
using imperative::OpBase::OpBase; // Inherit constructors
};
class PyVarBase : public imperative::VarBase {
public:
using imperative::VarBase::VarBase; // Inherit constructors
};
void BindTracer(pybind11::module* m);
} // namespace pybind
} // namespace paddle
...@@ -34,6 +34,7 @@ limitations under the License. */ ...@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
...@@ -45,6 +46,7 @@ limitations under the License. */ ...@@ -45,6 +46,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/async_executor_py.h"
#include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/recordio.h"
...@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) { ...@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) {
BindException(&m); BindException(&m);
py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
.def(py::init<>())
.def("_run_backward",
[](imperative::VarBase &self, framework::Scope *scope) {
self.RunBackward(scope);
})
.def("_grad", &imperative::VarBase::Grad)
.def_property(
"desc",
[](const imperative::VarBase &self) { return self.var_desc_; },
[](imperative::VarBase &self, framework::VarDesc *var_desc) {
self.var_desc_ = var_desc;
},
py::return_value_policy::reference);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<>())
.def_property(
"desc", [](const imperative::OpBase &self) { return self.op_desc_; },
[](imperative::OpBase &self, framework::OpDesc *op_desc) {
if (op_desc) {
self.op_desc_ = op_desc;
}
},
py::return_value_policy::reference);
py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>())
.def("forward",
[](imperative::Layer &self,
const std::vector<imperative::VarBase> &inputs) {
return self.Forward(inputs);
})
.def("backward", &imperative::Layer::Backward);
BindTracer(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol()) py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer( .def_buffer(
[](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
...@@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
m.def("get_variable_tensor", framework::GetVariableTensor);
m.def("_is_program_version_supported", IsProgramVersionSupported); m.def("_is_program_version_supported", IsProgramVersionSupported);
......
...@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc) ...@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc) cc_test(to_string_test SRCS to_string_test.cc)
cc_test(split_test SRCS split_test.cc)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sstream>
#include <string>
#include <vector>
namespace paddle {
namespace string {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
} // namespace string
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/string/split.h"
#include <string>
#include "gtest/gtest.h"
TEST(StringSplit, StringSplit) {
std::string to_split = "0,1,2,3,4,5";
int i = 0;
for (auto s : paddle::string::Split(to_split, ',')) {
EXPECT_EQ(atoi(s.c_str()), i);
i++;
}
}
...@@ -437,13 +437,11 @@ EOF ...@@ -437,13 +437,11 @@ EOF
export http_proxy= export http_proxy=
export https_proxy= export https_proxy=
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j $1 ctest --output-on-failure -j $2
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
set -e
python -c "import paddle.fluid"
elif [ "$1" == "cp35-cp35m" ]; then elif [ "$1" == "cp35-cp35m" ]; then
pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp36-cp36m" ]; then elif [ "$1" == "cp36-cp36m" ]; then
...@@ -918,7 +916,7 @@ function main() { ...@@ -918,7 +916,7 @@ function main() {
maccheck) maccheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build_mac build_mac
run_mac_test ${PROC_RUN:-1} run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
;; ;;
macbuild) macbuild)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
......
...@@ -34,6 +34,7 @@ from . import io ...@@ -34,6 +34,7 @@ from . import io
from . import evaluator from . import evaluator
from . import initializer from . import initializer
from . import layers from . import layers
from . import imperative
from . import contrib from . import contrib
from . import nets from . import nets
from . import optimizer from . import optimizer
...@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'initializer', 'initializer',
'layers', 'layers',
'contrib', 'contrib',
'imperative',
'transpiler', 'transpiler',
'nets', 'nets',
'optimizer', 'optimizer',
...@@ -147,7 +149,7 @@ def __bootstrap__(): ...@@ -147,7 +149,7 @@ def __bootstrap__():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search' 'cudnn_exhaustive_search', 'selected_gpus'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
......
...@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"All parameters' 'clip_norm' of a same group should be the same" "All parameters' 'clip_norm' of a same group should be the same"
) )
square = grad * grad merge_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(grad)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
square = layers.square(merge_grad)
local_norm_var = layers.reduce_sum(input=square) local_norm_var = layers.reduce_sum(input=square)
context[self.group_name].append(local_norm_var) context[self.group_name].append(local_norm_var)
...@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
new_grad = layers.elementwise_mul( new_grad = layers.elementwise_mul(
x=grad, y=self.context[group_scale_name]) x=grad, y=self.context[group_scale_name])
return param, new_grad return param, new_grad
......
...@@ -18,6 +18,7 @@ import collections ...@@ -18,6 +18,7 @@ import collections
import contextlib import contextlib
import re import re
import six import six
import sys
import numpy as np import numpy as np
...@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ...@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None
def _in_imperative_mode():
return _imperative_tracer_ is not None
def _imperative_tracer():
return _imperative_tracer_
class NameScope(object): class NameScope(object):
def __init__(self, name="", parent=None): def __init__(self, name="", parent=None):
...@@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True): ...@@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True):
return proto.__str__() return proto.__str__()
class Variable(object): class Variable(core.VarBase):
""" """
In Fluid, every input and output of an operator is a variable. In most In Fluid, every input and output of an operator is a variable. In most
cases, variables are used for holding different kinds of data or training cases, variables are used for holding different kinds of data or training
...@@ -266,6 +277,7 @@ class Variable(object): ...@@ -266,6 +277,7 @@ class Variable(object):
stop_gradient=False, stop_gradient=False,
is_data=False, is_data=False,
**kwargs): **kwargs):
core.VarBase.__init__(self)
self.block = block self.block = block
self.error_clip = error_clip self.error_clip = error_clip
...@@ -346,6 +358,18 @@ class Variable(object): ...@@ -346,6 +358,18 @@ class Variable(object):
self.stop_gradient = stop_gradient self.stop_gradient = stop_gradient
self.is_data = is_data self.is_data = is_data
def _numpy(self):
scope = _imperative_tracer().get_scope(self.block.desc)
tensor = core.get_variable_tensor(scope, self.desc.name())
return np.array(tensor)
def _backward(self):
scope = _imperative_tracer().get_scope(self.block.desc)
self._run_backward(scope)
def _gradient(self):
return np.array(self._grad())
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
...@@ -492,7 +516,7 @@ class OpProtoHolder(object): ...@@ -492,7 +516,7 @@ class OpProtoHolder(object):
} }
class Operator(object): class Operator(core.OpBase):
""" """
In Fluid, all the operation are represented by Operator, and Operator In Fluid, all the operation are represented by Operator, and Operator
is regarded as a build in an instruction of a Block. Users can use the is regarded as a build in an instruction of a Block. Users can use the
...@@ -548,6 +572,7 @@ class Operator(object): ...@@ -548,6 +572,7 @@ class Operator(object):
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=None): attrs=None):
core.OpBase.__init__(self)
self.block = block self.block = block
self.desc = desc self.desc = desc
# note: not add self.attrs here: # note: not add self.attrs here:
...@@ -587,6 +612,7 @@ class Operator(object): ...@@ -587,6 +612,7 @@ class Operator(object):
return True return True
return False return False
self.inputs = []
if inputs is not None: if inputs is not None:
for in_proto in proto.inputs: for in_proto in proto.inputs:
found = find_name(inputs, in_proto.name) found = find_name(inputs, in_proto.name)
...@@ -613,6 +639,13 @@ class Operator(object): ...@@ -613,6 +639,13 @@ class Operator(object):
else: else:
self.desc.set_input(in_proto.name, []) self.desc.set_input(in_proto.name, [])
for inp in inputs.values():
if isinstance(inp, Variable):
self.inputs.append(inp)
elif isinstance(inp, list) or isinstance(inp, tuple):
self.inputs.extend(inp[:])
self.outputs = []
if outputs is not None: if outputs is not None:
given = set() given = set()
need = set() need = set()
...@@ -641,6 +674,12 @@ class Operator(object): ...@@ -641,6 +674,12 @@ class Operator(object):
arg.op = self arg.op = self
self.desc.set_output(out_proto.name, out_arg_names) self.desc.set_output(out_proto.name, out_arg_names)
for out in outputs.values():
if isinstance(out, Variable):
self.outputs.append(out)
elif isinstance(out, list) or isinstance(out, tuple):
self.outputs.extend(out[:])
if op_attrs is not None: if op_attrs is not None:
if not isinstance(op_attrs, dict): if not isinstance(op_attrs, dict):
raise TypeError("'attrs' should be a dict.") raise TypeError("'attrs' should be a dict.")
...@@ -1206,6 +1245,8 @@ class Block(object): ...@@ -1206,6 +1245,8 @@ class Block(object):
""" """
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
if _in_imperative_mode():
_imperative_tracer().trace(op, op.inputs, op.outputs, self.desc)
self.ops.append(op) self.ops.append(op)
return op return op
...@@ -2209,3 +2250,12 @@ def _get_var(name, program=None): ...@@ -2209,3 +2250,12 @@ def _get_var(name, program=None):
assert isinstance(program, Program) assert isinstance(program, Program)
return program.global_block().var(name) return program.global_block().var(name)
@contextlib.contextmanager
def _imperative_guard(tracer):
global _imperative_tracer_
tmp_trace = _imperative_tracer_
_imperative_tracer_ = tracer
yield
_imperative_tracer_ = tmp_trace
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import base
from .base import *
from . import layers
from .layers import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
__all__ = ['enabled', 'guard', 'to_variable']
def enabled():
return framework._in_imperative_mode()
@contextlib.contextmanager
def guard():
train = framework.Program()
startup = framework.Program()
tracer = core.Tracer(train.current_block().desc)
with framework.program_guard(train, startup):
with framework.unique_name.guard():
with framework._imperative_guard(tracer):
yield
def to_variable(value, block=None):
if isinstance(value, np.ndarray):
if not block:
block = framework.default_main_program().current_block()
py_var = framework.Variable(
block,
type=core.VarDesc.VarType.LOD_TENSOR,
name=None,
shape=value.shape,
dtype=value.dtype)
scope = framework._imperative_tracer().get_scope(block.desc)
var = scope.var(py_var.name)
tensor = var.get_tensor()
tensor.set(value, core.CPUPlace())
return py_var
elif isinstance(value, framework.Variable):
return value
else:
raise ValueError("Unsupported type %s" % type(value))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import sys
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.imperative import base
__all__ = ['PyLayer']
class PyLayer(core.Layer):
def __init__(self):
pass
def __call__(self, inputs):
# TODO(panyx0718): Support declarative mode as well.
assert base.enabled()
if not isinstance(inputs, list) and not isinstance(inputs, tuple):
inputs = [inputs]
var_inputs = []
for x in inputs:
py_var = base.to_variable(x)
var_inputs.append(py_var)
outputs = self.forward(var_inputs)
return outputs
def forward(self, inputs):
return []
...@@ -17,10 +17,13 @@ from __future__ import print_function ...@@ -17,10 +17,13 @@ from __future__ import print_function
import copy import copy
import itertools import itertools
import six import six
import sys
import numpy as np
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
from . import unique_name from . import unique_name
from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.initializer import Constant, Xavier
from paddle.fluid.imperative import base
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from . import core from . import core
from six.moves import zip from six.moves import zip
...@@ -46,23 +49,21 @@ class LayerHelper(object): ...@@ -46,23 +49,21 @@ class LayerHelper(object):
def startup_program(self): def startup_program(self):
return default_startup_program() return default_startup_program()
def to_variable(self, x):
return base.to_variable(x, self.main_program.current_block())
def append_op(self, *args, **kwargs): def append_op(self, *args, **kwargs):
return self.main_program.current_block().append_op(*args, **kwargs) return self.main_program.current_block().append_op(*args, **kwargs)
def multiple_input(self, input_param_name='input'): def multiple_input(self, input_param_name='input'):
inputs = self.kwargs.get(input_param_name, []) inputs = self.kwargs.get(input_param_name, [])
type_error = TypeError( ret = []
"Input of {0} layer should be Variable or sequence of Variable". if isinstance(inputs, list) or isinstance(inputs, tuple):
format(self.layer_type)) for inp in inputs:
if isinstance(inputs, Variable): ret.append(self.to_variable(inp))
inputs = [inputs]
elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
raise type_error
else: else:
for each in inputs: ret.append(self.to_variable(inputs))
if not isinstance(each, Variable): return ret
raise type_error
return inputs
def input(self, input_param_name='input'): def input(self, input_param_name='input'):
inputs = self.multiple_input(input_param_name) inputs = self.multiple_input(input_param_name)
......
...@@ -169,6 +169,8 @@ __all__ = [ ...@@ -169,6 +169,8 @@ __all__ = [
'log_loss', 'log_loss',
'add_position_encoding', 'add_position_encoding',
'bilinear_tensor_product', 'bilinear_tensor_product',
'merge_selected_rows',
'get_tensor_from_selected_rows',
'lstm', 'lstm',
] ]
...@@ -6621,7 +6623,8 @@ def relu(x, name=None): ...@@ -6621,7 +6623,8 @@ def relu(x, name=None):
helper = LayerHelper('relu', **locals()) helper = LayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) helper.append_op(
type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
return out return out
...@@ -8382,6 +8385,29 @@ def mean(x, name=None): ...@@ -8382,6 +8385,29 @@ def mean(x, name=None):
return out return out
@templatedoc()
def merge_selected_rows(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("merge_selected_rows", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type="merge_selected_rows",
inputs={"X": x},
attrs={},
outputs={"Out": out})
return out
@templatedoc() @templatedoc()
def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
""" """
...@@ -9034,3 +9060,26 @@ def bilinear_tensor_product(x, ...@@ -9034,3 +9060,26 @@ def bilinear_tensor_product(x,
# add activation # add activation
return helper.append_activation(out) return helper.append_activation(out)
@templatedoc()
def get_tensor_from_selected_rows(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper('get_tensor_from_selected_rows', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='get_tensor_from_selected_rows',
inputs={'X': x},
outputs={'Out': out},
attrs={})
return out
...@@ -95,7 +95,14 @@ class ParallelExecutor(object): ...@@ -95,7 +95,14 @@ class ParallelExecutor(object):
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
gpus = []
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
for i in six.moves.range(core.get_cuda_device_count()): for i in six.moves.range(core.get_cuda_device_count()):
gpus.append(i)
for i in gpus:
p = core.Place() p = core.Place()
self._act_places.append(core.CUDAPlace(i)) self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1]) p.set_place(self._act_places[-1])
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
BATCH_SIZE = 128
CLIP = 1
prog = fluid.framework.Program()
with fluid.program_guard(main_program=prog):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone()
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(fluid.default_startup_program())
count = 0
for data in train_reader():
count += 1
if count > 5:
break
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out[1:]:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip[1:]:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
if not np.isclose(
a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3):
exit(1)
exit(0)
...@@ -43,13 +43,14 @@ if(APPLE) ...@@ -43,13 +43,14 @@ if(APPLE)
list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_desc_clone)
list(REMOVE_ITEM TEST_OPS test_program_code) list(REMOVE_ITEM TEST_OPS test_program_code)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
# this op is not support on mac # this op is not support on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
# TODO: add the unitest back when it fixed # TODO: add the unitest back when it fixed
list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_detection_map_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
list(REMOVE_ITEM TEST_OPS test_gradient_clip)
endif() endif()
if(NOT WITH_MKLML) if(NOT WITH_MKLML)
# this op is not support on openblas # this op is not support on openblas
...@@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE) ...@@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE)
if(NOT APPLE) if(NOT APPLE)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# FIXME(typhoonzero): add these tests back # FIXME(typhoonzero): add these tests back
# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# py_test_modules(test_dist_transformer MODULES test_dist_transformer) # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
# TODO(typhoonzero): make dist test parallel when fix port management issue set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif() endif()
......
...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): ...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
self.activation = 'identity' self.activation = 'identity'
class TestIdentityActivation(TestConv2dFusionOp):
def init_activation(self):
self.activation = 'identity'
self.add_residual_data = False
class TestWithGroup(TestConv2dFusionOp): class TestWithGroup(TestConv2dFusionOp):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
......
...@@ -291,8 +291,8 @@ class TestDistBase(unittest.TestCase): ...@@ -291,8 +291,8 @@ class TestDistBase(unittest.TestCase):
if check_error_log: if check_error_log:
err_log.close() err_log.close()
sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
sys.stderr.write('local_stderr: %s\n' % local_err) sys.stderr.write('local_stderr: %s\n' % local_err)
sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
return pickle.loads(local_out) return pickle.loads(local_out)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid.core as core
import numpy as np
from paddle.fluid.op import Operator
class TestGetTensorFromSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_with_place(self, place):
scope = core.Scope()
x_rows = [0, 5, 5, 4, 20]
height = 20
row_numel = 2
np_array = np.ones((len(x_rows), row_numel)).astype("float32")
np_array[1, :] = 2.0
np_array[2, :] = 3.0
np_array[3, :] = 4.0
# initialize input variable X
x = scope.var('X').get_selected_rows()
x.set_rows(x_rows)
x.set_height(height)
x_tensor = x.get_tensor()
x_tensor.set(np_array, place)
# initialize input variable Out
out = scope.var("Out").get_tensor()
op = Operator("get_tensor_from_selected_rows", X="X", Out="Out")
op.run(scope, place)
out_array = np.array(out)
self.assertEqual((5, 2), out_array.shape)
assert (out_array == np_array).all()
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
BATCH_SIZE = 128
CLIP = 1
def bow_net(data,
label,
dict_dim,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=True, size=[dict_dim, emb_dim])
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestGradientClip(unittest.TestCase):
def setUp(self):
self.word_dict = paddle.dataset.imdb.word_dict()
self.BATCH_SIZE = 2
self.train_data = paddle.batch(
paddle.dataset.imdb.train(self.word_dict),
batch_size=self.BATCH_SIZE)
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_operators(self, place):
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone()
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(startup_program)
count = 0
for data in train_reader():
count += 1
if count > 5:
break
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out[1:]:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip[1:]:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
assert np.isclose(
a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
def check_sparse_gradient_clip(self, place):
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = bow_net(data, label, len(self.word_dict))
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(cost)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(startup_program)
data = next(self.train_data())
val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
self.assertEqual((1, ), val.shape)
print(val)
self.assertFalse(np.isnan(val))
def test_operators(self):
self.check_operators(core.CPUPlace())
def test_sparse_gradient_clip(self):
for place in self.get_places():
self.check_sparse_gradient_clip(place)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import sys
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
class MyLayer(fluid.imperative.PyLayer):
def __init__(self):
super(MyLayer, self).__init__()
def forward(self, inputs):
x = fluid.layers.relu(inputs[0])
self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)]
class TestImperative(unittest.TestCase):
def test_layer(self):
with fluid.imperative.guard():
cl = core.Layer()
cl.forward([])
l = fluid.imperative.PyLayer()
l.forward([])
def test_layer_in_out(self):
with fluid.imperative.guard():
l = MyLayer()
x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
self.assertIsNotNone(x)
sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
x._backward()
sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid.core as core
import numpy as np
from paddle.fluid.op import Operator
class TestMergeSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_with_place(self, place):
scope = core.Scope()
x_rows = [0, 5, 5, 4, 20]
out_rows = [0, 4, 5, 20]
height = 20
row_numel = 2
np_array = np.ones((len(x_rows), row_numel)).astype("float32")
np_array[1, :] = 2.0
np_array[2, :] = 3.0
np_array[3, :] = 4.0
# initialize input variable X
x = scope.var('X').get_selected_rows()
x.set_rows(x_rows)
x.set_height(height)
x_tensor = x.get_tensor()
x_tensor.set(np_array, place)
# initialize input variable Out
out = scope.var("Out").get_selected_rows()
op = Operator("merge_selected_rows", X="X", Out="Out")
op.run(scope, place)
self.assertEqual(out.rows(), out_rows)
self.assertEqual(out.height(), height)
out_array = np.array(out.get_tensor())
self.assertEqual((4, 2), out_array.shape)
assert (out_array[0, :] == 1.0).all()
assert (out_array[1, :] == 4.0).all()
assert (out_array[2, :] == 5.0).all()
assert (out_array[3, :] == 1.0).all()
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
...@@ -101,6 +101,7 @@ packages=['paddle', ...@@ -101,6 +101,7 @@ packages=['paddle',
'paddle.dataset', 'paddle.dataset',
'paddle.reader', 'paddle.reader',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.imperative',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers', 'paddle.fluid.layers',
......
...@@ -27,6 +27,8 @@ import pydoc ...@@ -27,6 +27,8 @@ import pydoc
member_dict = collections.OrderedDict() member_dict = collections.OrderedDict()
experimental_namespace = {"paddle.fluid.imperative"}
def visit_member(parent_name, member): def visit_member(parent_name, member):
cur_name = ".".join([parent_name, member.__name__]) cur_name = ".".join([parent_name, member.__name__])
...@@ -51,6 +53,8 @@ def visit_member(parent_name, member): ...@@ -51,6 +53,8 @@ def visit_member(parent_name, member):
def visit_all_module(mod): def visit_all_module(mod):
if (mod.__name__ in experimental_namespace):
return
for member_name in ( for member_name in (
name name
for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册