Commit 2c2a192e authored by H hjchen2

Resolve merge conflicts

test=develop
...@@ -4,6 +4,7 @@ paddle/operators/tensor.save ...@@ -4,6 +4,7 @@ paddle/operators/tensor.save
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
paddle/fluid/operators/distributed/send_recv.proto
*.DS_Store *.DS_Store
*.vs *.vs
build/ build/
...@@ -28,4 +29,5 @@ third_party/ ...@@ -28,4 +29,5 @@ third_party/
build_* build_*
# clion workspace. # clion workspace.
cmake-build-* cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
model_test model_test
...@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") ...@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
add_subdirectory(paddle) add_subdirectory(paddle)
if(WITH_PYTHON) if(WITH_PYTHON)
add_subdirectory(python) add_subdirectory(python)
...@@ -312,10 +320,3 @@ if(WITH_DOC) ...@@ -312,10 +320,3 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED) find_python_module(recommonmark REQUIRED)
add_subdirectory(doc) add_subdirectory(doc)
endif() endif()
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
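For context on the new add_definitions(-DPADDLE_ON_INFERENCE) line above: a minimal, self-contained sketch of how such a compile definition is typically consumed in C++ sources. Only the macro name comes from this diff; the surrounding program is illustrative.

#include <iostream>

int main() {
#ifdef PADDLE_ON_INFERENCE
  // Path taken when the project is configured with ON_INFER=ON,
  // which adds -DPADDLE_ON_INFERENCE to the compile flags.
  std::cout << "inference build: inference-specific optimizations enabled\n";
#else
  std::cout << "regular build: no inference-specific optimizations\n";
#endif
  return 0;
}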
...@@ -111,7 +111,7 @@ function(op_library TARGET) ...@@ -111,7 +111,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op") "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -30,6 +30,8 @@ class ExceptionHolder { ...@@ -30,6 +30,8 @@ class ExceptionHolder {
Catch(exp); Catch(exp);
} catch (platform::EnforceNotMet exp) { } catch (platform::EnforceNotMet exp) {
Catch(exp); Catch(exp);
} catch (std::exception& ex) {
LOG(FATAL) << "std::exception caught, " << ex.what();
} catch (...) { } catch (...) {
LOG(FATAL) << "Unknown exception caught"; LOG(FATAL) << "Unknown exception caught";
} }
......
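The new catch clause above is placed before the catch-all so that the message from std::exception::what() can be reported. A minimal, standalone sketch of that ordering (not Paddle code; the thrown message is made up):

#include <iostream>
#include <stdexcept>

void Run() { throw std::runtime_error("tensor shape mismatch"); }  // hypothetical failure

int main() {
  try {
    Run();
  } catch (std::exception& ex) {
    // Caught here, so the diagnostic can include ex.what().
    std::cerr << "std::exception caught, " << ex.what() << "\n";
  } catch (...) {
    // Only truly unknown exception types fall through to this handler.
    std::cerr << "Unknown exception caught\n";
  }
  return 0;
}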
...@@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
DeleteUnusedTensors(*local_scope, op.get(), gc.get(), DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->cur_ref_cnts_)); &(ctx->cur_ref_cnts_));
} }
if (FLAGS_benchmark) {
VLOG(20) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
} }
if (gc != nullptr) { if (gc != nullptr) {
...@@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
scope->DropKids(); scope->DropKids();
} }
} }
if (FLAGS_benchmark) {
VLOG(20) << "-------------------------------------------------------";
VLOG(20) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(20) << "-------------------------------------------------------";
}
} }
void Executor::RunPreparedContext( void Executor::RunPreparedContext(
......
...@@ -14,14 +14,15 @@ ...@@ -14,14 +14,15 @@
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include <functional> #include <functional>
#include <utility> #include <list>
#include <map>
#include <tuple>
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace {
// The function keeps the graph consistent by replacing // The function keeps the graph consistent by replacing
// a node 'from' in the set of inputs nodes // a node 'from' in the set of inputs nodes
...@@ -51,99 +52,382 @@ void CorrectGraphEdges(Graph* graph, Node* from, Node* to) { ...@@ -51,99 +52,382 @@ void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
} }
} }
} }
} // namespace
using graph_ptr = std::unique_ptr<ir::Graph>;
graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
FusePassBase::Init(name_scope_, graph.get()); auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
for (auto n : graph->Nodes()) {
if (n == node) {
return n;
}
}
GraphPatternDetector gpd; return nullptr;
auto pattern = gpd.mutable_pattern(); };
patterns::Conv conv_pattern{pattern, name_scope_}; if (from == to) {
auto conv_output = conv_pattern(); return true;
}
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; std::map<Node*, bool> visited;
elementwise_add_pattern(conv_output);
conv_output->AsIntermediate(); for (auto& node : GraphTraits::DFS(*graph)) {
visited[&node] = false;
}
auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> { visited[from] = true;
auto bias_input_names = conv_op.Op()->Inputs();
auto bias_it = bias_input_names.find("Bias");
if (bias_it != std::end(bias_input_names)) {
bool has_bias = !bias_it->second.empty();
if (has_bias) {
auto conv_bias_names = bias_it->second;
auto conv_bias_names_it =
std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
[&conv_bias_names](Node* n) -> bool {
return n->Name() == conv_bias_names[0];
});
return std::make_pair(has_bias, *conv_bias_names_it);
}
}
return std::make_pair(false, nullptr); std::list<Node*> queue;
}; queue.push_back(from);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, while (!queue.empty()) {
Graph* g) { auto cur = find_node(graph, queue.front());
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); queue.pop_front();
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; if (!cur) return false;
OpDesc op_desc; for (auto n : cur->outputs) {
op_desc.SetType("conv2d"); if (n == to) {
return true;
}
op_desc.SetInput("Input", {conv_input->Name()}); if (!visited[n]) {
op_desc.SetInput("Filter", {conv_filter->Name()}); visited[n] = true;
op_desc.SetInput("ResidualData", {elementwise_add_x->Name()}); queue.push_back(n);
op_desc.SetOutput("Output", {conv_output->Name()}); }
}
}
return false;
}
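IsReachable above is a breadth-first search from `from` over output edges, stopping as soon as `to` is seen. A standalone sketch of the same idea on a plain adjacency list (the ir::Graph specifics are dropped; all types here are illustrative):

#include <map>
#include <queue>
#include <set>
#include <vector>

// Breadth-first reachability: returns true if `to` can be reached from `from`.
bool Reachable(const std::map<int, std::vector<int>>& edges, int from, int to) {
  if (from == to) return true;
  std::set<int> visited{from};
  std::queue<int> pending;
  pending.push(from);
  while (!pending.empty()) {
    int cur = pending.front();
    pending.pop();
    auto it = edges.find(cur);
    if (it == edges.end()) continue;
    for (int next : it->second) {
      if (next == to) return true;
      if (visited.insert(next).second) pending.push(next);  // enqueue unvisited nodes once
    }
  }
  return false;
}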
bool has_bias; boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) {
Node* conv_bias; auto bias_input_names = op.Op()->Inputs();
auto bias_it = bias_input_names.find(bias_name);
std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op); if (bias_it != std::end(bias_input_names)) {
bool has_bias = !bias_it->second.empty();
if (has_bias) { if (has_bias) {
op_desc.SetInput("Bias", {conv_bias->Name()}); auto bias_names = bias_it->second;
auto bias_names_it =
std::find_if(std::begin(op.inputs), std::end(op.inputs),
[&bias_names](Node* n) -> bool {
return n->Name() == bias_names[0];
});
return *bias_names_it;
} }
}
for (const auto& attr : conv_op->Op()->GetAttrMap()) { return boost::none;
op_desc.SetAttr(attr.first, attr.second); }
}
op_desc.SetAttr("fuse_residual_connection", true); ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
const ResidualConnectionMKLDNNFusePass::IdentityConvFunc&
get_node_from_conv_op,
const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc&
get_node_from_elementwise_add_op)
: fusion_stats{std::make_shared<int>(0)},
can_fuse_func{can_fuse_func},
get_node_from_conv_op{get_node_from_conv_op},
get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* conv_op;
Node* conv_input;
Node* conv_filter;
Node* conv_output;
Node* elementwise_add_op;
Node* elementwise_add_identity;
Node* elementwise_add_out;
std::tie(conv_op, conv_input, conv_filter, conv_output) =
get_node_from_conv_op(subgraph);
std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) =
get_node_from_elementwise_add_op(subgraph);
if (!can_fuse_func(conv_op, elementwise_add_op)) return;
if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
OpDesc op_desc;
op_desc.SetType("conv2d");
op_desc.SetInput("Input", {conv_input->Name()});
op_desc.SetInput("Filter", {conv_filter->Name()});
op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()});
op_desc.SetOutput("Output", {conv_output->Name()});
auto conv_bias = HasBias(*conv_op, "Bias");
if (conv_bias) {
op_desc.SetInput("Bias", {(*conv_bias)->Name()});
}
auto fused_conv_op = g->CreateOpNode(&op_desc); for (const auto& attr : conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
IR_NODE_LINK_TO(conv_input, fused_conv_op); op_desc.SetAttr("fuse_residual_connection", true);
IR_NODE_LINK_TO(conv_filter, fused_conv_op);
IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, conv_output);
if (has_bias) { auto fused_conv_op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(conv_bias, fused_conv_op);
}
CorrectGraphEdges(g, elementwise_add_out, conv_output); IR_NODE_LINK_TO(conv_input, fused_conv_op);
GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op}); IR_NODE_LINK_TO(conv_filter, fused_conv_op);
}; IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, conv_output);
gpd(graph.get(), handler); if (conv_bias) {
IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, conv_output);
GraphSafeRemoveNodes(graph,
{elementwise_add_out, conv_op, elementwise_add_op});
(*fusion_stats)++;
}
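HasBias returns boost::optional<Node*> instead of the earlier (bool, Node*) pair, and the handler dereferences it only after checking. A small self-contained sketch of that return-and-check idiom (boost is the only dependency; the lookup itself is made up):

#include <boost/optional.hpp>
#include <iostream>
#include <map>
#include <string>

// Return boost::none when the named input is absent, its value otherwise.
boost::optional<std::string> FindInput(const std::map<std::string, std::string>& inputs,
                                       const std::string& name) {
  auto it = inputs.find(name);
  if (it == inputs.end()) return boost::none;
  return it->second;
}

int main() {
  std::map<std::string, std::string> inputs{{"Bias", "conv2d.b_0"}};
  auto bias = FindInput(inputs, "Bias");
  if (bias) {                       // same shape as `if (conv_bias) { ... }` above
    std::cout << "bias input: " << *bias << "\n";
  }
  return 0;
}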
ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle(
const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
get_node_from_conv_x_op,
const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
get_node_from_conv_y_op,
const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc&
get_node_from_elementwise_add_op)
: fusion_stats{std::make_shared<int>(0)},
can_fuse_func{can_fuse_func},
get_node_from_conv_x_op{get_node_from_conv_x_op},
get_node_from_conv_y_op{get_node_from_conv_y_op},
get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* conv_x_op;
Node* conv_x_input;
Node* conv_x_filter;
Node* conv_x_output;
Node* conv_y_op;
Node* conv_y_input;
Node* conv_y_filter;
Node* conv_y_output;
Node* elementwise_add_op;
Node* elementwise_add_out;
std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) =
get_node_from_conv_x_op(subgraph);
std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) =
get_node_from_conv_y_op(subgraph);
std::tie(elementwise_add_op, elementwise_add_out) =
get_node_from_elementwise_add_op(subgraph);
if (!can_fuse_func(conv_x_op, elementwise_add_op)) return;
if (!can_fuse_func(conv_y_op, elementwise_add_op)) return;
Node* projection_node;
Node* residual_conv_op;
Node* residual_conv_input;
Node* residual_conv_filter;
Node* residual_conv_output;
if (IsReachable(graph, conv_x_input, conv_y_output)) {
projection_node = conv_x_output;
residual_conv_op = conv_y_op;
residual_conv_input = conv_y_input;
residual_conv_filter = conv_y_filter;
residual_conv_output = conv_y_output;
} else if (IsReachable(graph, conv_y_input, conv_x_output)) {
projection_node = conv_y_output;
residual_conv_op = conv_x_op;
residual_conv_input = conv_x_input;
residual_conv_filter = conv_x_filter;
residual_conv_output = conv_x_output;
} else {
return;
}
OpDesc op_desc;
op_desc.SetType("conv2d");
op_desc.SetInput("Input", {residual_conv_input->Name()});
op_desc.SetInput("Filter", {residual_conv_filter->Name()});
op_desc.SetInput("ResidualData", {projection_node->Name()});
op_desc.SetOutput("Output", {residual_conv_output->Name()});
auto residual_conv_bias = HasBias(*residual_conv_op, "Bias");
if (residual_conv_bias) {
op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
}
for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
IR_NODE_LINK_TO(projection_node, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
if (residual_conv_bias) {
IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
GraphSafeRemoveNodes(
graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
(*fusion_stats)++;
}
std::tuple<Node*, Node*, Node*, Node*>
ResidualConnectionMKLDNNFusePass::GetNodesFromConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const {
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
return std::make_tuple(conv_op, conv_input, conv_filter, conv_output);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
ir::Graph* graph;
int stats;
std::tie(graph, stats) = graph_with_stats;
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
conv_output,
pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
conv_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_y,
elementwise_add_out);
};
return ExecuteHandleOnGraph<IdentityFuseHandle>(
&gpd, graph_with_stats,
[this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_pattern, subgraph);
},
get_node_from_elementwise_add);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
conv_output);
conv_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_x,
elementwise_add_out);
};
return ExecuteHandleOnGraph<IdentityFuseHandle>(
&gpd, graph_with_stats,
[this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_pattern, subgraph);
},
get_node_from_elementwise_add);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_x_pattern{pattern, name_scope};
auto conv_x_output = conv_x_pattern();
patterns::Conv conv_y_pattern{pattern, name_scope};
auto conv_y_output = conv_y_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(conv_x_output, conv_y_output);
conv_x_output->AsIntermediate();
conv_y_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_out);
};
return ExecuteHandleOnGraph<ProjectionFuseHandle>(
&gpd, graph_with_stats,
[this,
&conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_x_pattern, subgraph);
},
[this,
&conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_y_pattern, subgraph);
},
get_node_from_elementwise_add);
}
graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
FusePassBase::Init(name_scope_, graph.get());
auto fused_graph_with_stats = FuseConvAsY(
name_scope_,
FuseConvAsX(
name_scope_,
FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0))));
std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl;
AddStatis(fused_graph_with_stats.second);
return graph; return graph;
} }
} // namespace ir } // namespace ir
...@@ -151,4 +435,4 @@ graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { ...@@ -151,4 +435,4 @@ graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
} // namespace paddle } // namespace paddle
REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass); paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
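ApplyImpl chains FuseProjectionConv, FuseConvAsX and FuseConvAsY, each taking and returning a GraphWithStats pair so the fusion count accumulates across stages. A framework-free sketch of that composition pattern (stage names and the one-fusion-per-stage count are invented for illustration):

#include <iostream>
#include <string>
#include <utility>

struct Graph { std::string name; };                 // stand-in for ir::Graph
using GraphWithStats = std::pair<Graph*, int>;      // mirrors the typedef in the header

// Each stage rewrites the graph in place and adds its fusion count to the total.
GraphWithStats FuseStage(const std::string& stage, const GraphWithStats& in) {
  int fused_here = 1;  // pretend exactly one pattern was fused in this stage
  std::cout << stage << ": fused " << fused_here << " pattern(s)\n";
  return {in.first, in.second + fused_here};
}

int main() {
  Graph g{"model"};
  auto result = FuseStage("conv_as_y",
                  FuseStage("conv_as_x",
                    FuseStage("projection", std::make_pair(&g, 0))));
  std::cout << "total fusions: " << result.second << "\n";
  return 0;
}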
...@@ -15,24 +15,119 @@ ...@@ -15,24 +15,119 @@
#pragma once #pragma once
#include <string> #include <string>
#include <tuple>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <boost/optional.hpp>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class ConvElementwiseAddMKLDNNFusePass : public FusePassBase { using graph_ptr = std::unique_ptr<ir::Graph>;
using GraphWithStats = std::pair<ir::Graph*, int>;
void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
bool IsReachable(ir::Graph* graph, Node* from, Node* to);
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
class ResidualConnectionMKLDNNFusePass : public FusePassBase {
private:
GraphWithStats FuseConvAsX(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseConvAsY(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseProjectionConv(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
template <typename RetType>
using GetNodeFunc =
std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
using IdentityElementwiseAddFunc =
GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
using ProjectionConvFunc = IdentityConvFunc;
using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
using CanFuseFunc = std::function<bool(Node*, Node*)>;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
template <typename HandleType, typename... OpFuncs>
GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
const GraphWithStats& graph_with_stats,
OpFuncs&&... op_funcs) const {
ir::Graph* graph;
int stats;
std::tie(graph, stats) = graph_with_stats;
auto can_fuse = [this](Node* op1, Node* op2) -> bool {
return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
};
auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
(*gpd)(graph, fuse_handle);
return std::make_pair(graph, stats + fuse_handle.get_stats());
}
struct IdentityFuseHandle {
IdentityFuseHandle(
const CanFuseFunc& can_fuse_func,
const IdentityConvFunc& get_node_from_conv_op,
const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
IdentityConvFunc get_node_from_conv_op;
IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
};
struct ProjectionFuseHandle {
ProjectionFuseHandle(
const CanFuseFunc& can_fuse_func,
const ProjectionConvFunc& get_node_from_conv_x_op,
const ProjectionConvFunc& get_node_from_conv_y_op,
const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
ProjectionConvFunc get_node_from_conv_x_op;
ProjectionConvFunc get_node_from_conv_y_op;
ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
};
public: public:
virtual ~ConvElementwiseAddMKLDNNFusePass() {} virtual ~ResidualConnectionMKLDNNFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
const std::string name_scope_{"residual_connections_fuse_pass"}; const std::string name_scope_{"residual_connection_fuse_pass"};
}; };
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, ...@@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
op->SetOutput(output.first, {output.second}); op->SetOutput(output.first, {output.second});
} }
struct IsReachable { struct TestIsReachable {
using func = std::function<bool(const std::string&, const std::string&)>; using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func { auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
...@@ -89,7 +89,9 @@ struct IsReachable { ...@@ -89,7 +89,9 @@ struct IsReachable {
} }
}; };
void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) { void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
int expected_conv_count,
int expected_elementwise_add_count = 0) {
int conv_count = 0; int conv_count = 0;
int elementwise_add_count = 0; int elementwise_add_count = 0;
...@@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) { ...@@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
++elementwise_add_count; ++elementwise_add_count;
} }
} }
EXPECT_EQ(conv_count, 1); EXPECT_EQ(conv_count, expected_conv_count);
EXPECT_EQ(elementwise_add_count, 0); EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
} }
ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars, ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
...@@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars, ...@@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
return prog; return prog;
} }
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
const std::string& to, int expected_conv_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
IsReachable is_reachable; TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "relu")); EXPECT_TRUE(is_reachable(graph)(from, to));
auto pass = auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
...@@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) { ...@@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size(); int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu")); EXPECT_TRUE(is_reachable(graph)(from, to));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num); current_nodes_num);
AssertOpsCount(graph); AssertOpsCount(graph, expected_conv_num);
} }
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
ConvolutionWithElementwiseAddReluNoBias) { auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable; SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
EXPECT_TRUE(is_reachable(graph)("a", "relu")); SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
auto pass = RunPassAndAssert(&prog, "a", "relu", 1);
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); }
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu")); TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsYWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
current_nodes_num); SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
AssertOpsCount(graph); RunPassAndAssert(&prog, "a", "relu", 1);
} }
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) { TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"}); auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}}, {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"}); {"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
IsReachable is_reachable; RunPassAndAssert(&prog, "a", "relu", 1);
EXPECT_TRUE(is_reachable(graph)("a", "d")); }
auto pass = TEST(ConvElementwiseAddMKLDNNFusePass,
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); ConvolutionAsXWithElementwiseAddReluNoBias) {
int original_nodes_num = graph->Nodes().size(); auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_FALSE(is_reachable(graph)("a", "d")); SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, RunPassAndAssert(&prog, "a", "relu", 1);
current_nodes_num);
AssertOpsCount(graph);
} }
TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) { TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
auto prog = auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"}); BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"}); {"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
{"Output", "e"});
IsReachable is_reachable; SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
EXPECT_TRUE(is_reachable(graph)("a", "f")); std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "g"));
auto pass = auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
...@@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) { ...@@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size(); int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "f")); EXPECT_TRUE(is_reachable(graph)("a", "g"));
EXPECT_EQ(original_nodes_num, current_nodes_num);
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, AssertOpsCount(graph, 2, 1);
current_nodes_num);
AssertOpsCount(graph);
} }
} // namespace ir } // namespace ir
......
...@@ -15,8 +15,15 @@ limitations under the License. */ ...@@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm> #include <algorithm>
#include <deque> #include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set> #include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) { ...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
graph_nodes.emplace_back(g_nodes); graph_nodes.emplace_back(g_nodes);
} }
if (VLOG_IS_ON(100)) { if (FLAGS_print_sub_graph_dir.size()) {
VLOG(100) << "graph_num: " << graph_nodes.size(); if (graph_nodes.size() > 1) {
for (auto &g_n : graph_nodes) { std::stringstream out;
VLOG(100) << "graph_nodes: " << g_n.size(); for (auto &g_n : graph_nodes) {
if (g_n.size() < 10) { out << "graph_nodes: " << g_n.size() << "\n";
std::stringstream out; }
out << "\n\n";
for (auto &g_n : graph_nodes) {
out << "graph_nodes: " << g_n.size();
for (auto &node : g_n) { for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in ["; out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) { for (auto &n : node->inputs) {
...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) { ...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
} }
out << "]"; out << "]";
} }
VLOG(100) << out.str(); out << "\n\n\n";
} }
std::unique_ptr<std::ostream> fout(
new std::ofstream(FLAGS_print_sub_graph_dir));
PADDLE_ENFORCE(fout->good());
*fout << out.str();
} }
} }
......
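The VLOG-based dump above is replaced by an optional file dump controlled by the new print_sub_graph_dir gflag. A minimal sketch of that define-and-dump pattern (the flag name matches the diff; the helper function and report string are illustrative):

#include <fstream>
#include <string>
#include "gflags/gflags.h"

DEFINE_string(print_sub_graph_dir, "",
              "If non-empty, write sub-graph node listings to this path.");

// Write the report only when the flag is set, mirroring the
// `if (FLAGS_print_sub_graph_dir.size())` guard in the diff.
void MaybeDumpSubGraphs(const std::string& report) {
  if (FLAGS_print_sub_graph_dir.empty()) return;
  std::ofstream fout(FLAGS_print_sub_graph_dir);
  if (fout.good()) fout << report;
}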
...@@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() { ...@@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
return output_var; return output_var;
} }
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) { PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add"); ->assert_is_op("elementwise_add");
x_var->assert_is_op_input("elementwise_add", "X"); x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
auto y_var = pattern->NewNode(elementwise_add_x_repr())
->AsInput()
->assert_is_op_input("elementwise_add", "Y");
auto out_var = pattern->NewNode(elementwise_add_out_repr()) auto out_var = pattern->NewNode(elementwise_add_out_repr())
->AsOutput() ->AsOutput()
->assert_is_op_output("elementwise_add", "Out"); ->assert_is_op_output("elementwise_add", "Out");
......
...@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase { ...@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add") {} : PatternBase(pattern, name_scope, "elementwise_add") {}
PDNode* operator()(PDNode* x_var); PDNode* operator()(PDNode* x_var, PDNode* y_var);
PATTERN_DECL_NODE(elementwise_add_op); PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_x); PATTERN_DECL_NODE(elementwise_add_x);
......
...@@ -111,9 +111,6 @@ class LoDTensor : public Tensor { ...@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
public: public:
LoDTensor() : Tensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
void set_lod(const LoD& lod) { lod_ = lod; } void set_lod(const LoD& lod) { lod_ = lod; }
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -31,46 +32,6 @@ namespace paddle { ...@@ -31,46 +32,6 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
...@@ -103,8 +64,6 @@ class Vector { ...@@ -103,8 +64,6 @@ class Vector {
o.ImmutableCPU(); o.ImmutableCPU();
cpu_ = o.cpu_; cpu_ = o.cpu_;
flag_ = kDataInCPU; flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this; return *this;
} }
...@@ -199,7 +158,7 @@ class Vector { ...@@ -199,7 +158,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place), PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place"); "CUDA Data must on CUDA place");
ImmutableCUDA(place); ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_); return reinterpret_cast<T *>(gpu_->ptr());
} }
// get cuda ptr. mutable // get cuda ptr. mutable
...@@ -234,13 +193,11 @@ class Vector { ...@@ -234,13 +193,11 @@ class Vector {
std::mutex &Mutex() const { return mtx_; } std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const { boost::optional<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) { return gpu_ == nullptr
return nullptr; ? boost::none
} else { : boost::optional<platform::CUDAPlace>(
return std::unique_ptr<platform::CUDAPlace>( boost::get<platform::CUDAPlace>(gpu_->place()));
new platform::CUDAPlace(gpu_.place_));
}
} }
private: private:
...@@ -254,13 +211,12 @@ class Vector { ...@@ -254,13 +211,12 @@ class Vector {
void CopyToCPU() const { void CopyToCPU() const {
// COPY GPU Data To CPU // COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(gpu_->place()));
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
void *src = gpu_.data_; void *src = gpu_->ptr();
void *dst = cpu_.data(); void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
stream); gpu_->size(), stream);
dev_ctx->Wait(); dev_ctx->Wait();
} }
...@@ -277,8 +233,7 @@ class Vector { ...@@ -277,8 +233,7 @@ class Vector {
CopyCPUDataToCUDA(place); CopyCPUDataToCUDA(place);
UnsetFlag(kDirty); UnsetFlag(kDirty);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (IsInCUDA() && } else if (IsInCUDA() && !(place == gpu_->place())) {
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen"); PADDLE_THROW("This situation should not happen");
// Still dirty // Still dirty
} else { } else {
...@@ -290,7 +245,7 @@ class Vector { ...@@ -290,7 +245,7 @@ class Vector {
// Even data is not dirty. However, data is not in CUDA. Copy data. // Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place); CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) { } else if (!(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen."); PADDLE_THROW("This situation should not happen.");
} else { } else {
// Not Dirty && DataInCUDA && Device is same // Not Dirty && DataInCUDA && Device is same
...@@ -301,13 +256,13 @@ class Vector { ...@@ -301,13 +256,13 @@ class Vector {
void CopyCPUDataToCUDA(const platform::Place &place) const { void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data(); void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T)); gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_; void *dst = gpu_->ptr();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
stream); gpu_->size(), stream);
} }
void ImmutableCPU() const { void ImmutableCPU() const {
...@@ -329,7 +284,7 @@ class Vector { ...@@ -329,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; } bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_; mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_; mutable memory::AllocationPtr gpu_;
mutable int flag_; mutable int flag_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
...@@ -428,8 +383,8 @@ class Vector { ...@@ -428,8 +383,8 @@ class Vector {
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr || if (cuda_place == boost::none ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) { cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place); return m_.Data().CUDAData(place);
} }
} }
...@@ -444,8 +399,8 @@ class Vector { ...@@ -444,8 +399,8 @@ class Vector {
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr || if (cuda_place == boost::none ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) { cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place); return m_.MutableData()->CUDAMutableData(place);
} }
} }
......
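The deleted CUDABuffer struct managed raw Alloc/Free calls by hand; the replacement keeps a single owning handle (memory::AllocationPtr) whose destructor releases the buffer. A framework-free sketch of that RAII shape using host memory and a plain unique_ptr (all names here are illustrative, not Paddle's allocator API):

#include <cstddef>
#include <cstdlib>
#include <memory>

// One owning handle replaces manual allocate/free bookkeeping.
struct Allocation {
  void* ptr;
  std::size_t size;
  ~Allocation() { std::free(ptr); }  // stand-in for returning device memory
};
using AllocationPtr = std::unique_ptr<Allocation>;

AllocationPtr Alloc(std::size_t size) {
  return AllocationPtr(new Allocation{std::malloc(size), size});
}

int main() {
  AllocationPtr gpu;   // empty, like the lazily allocated gpu_ member
  gpu = Alloc(256);    // reassignment releases any previous buffer automatically
  return 0;            // destructor frees the memory, no explicit ClearMemory()
}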
...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor( ...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
} }
// If the loss_var_name is given, the number of graph should be only one. // If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) { if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, size_t graph_num = ir::GraphNum(*graph);
"The number of graph should be only one"); if (graph_num > 1) {
LOG(WARNING)
<< "The number of graph should be only one, "
"but the current graph has "
<< ir::GraphNum(*graph)
<< " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
"please don't pass loss_var_name.";
}
} }
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
......
...@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const { ...@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
} }
void* Tensor::mutable_data(platform::Place place, std::type_index type, void* Tensor::mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr,
size_t requested_size) { size_t requested_size) {
if (holder_ != nullptr) { type_ = type;
holder_->set_type(type);
}
PADDLE_ENFORCE_GE(numel(), 0, PADDLE_ENFORCE_GE(numel(), 0,
"When calling this method, the Tensor's numel must be " "When calling this method, the Tensor's numel must be "
"equal or larger than zero. " "equal or larger than zero. "
...@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, ...@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
/* some versions of boost::variant don't have operator!= */ /* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) { holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) { holder_ = memory::AllocShared(place, size, attr);
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
}
#endif
offset_ = 0; offset_ = 0;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
void* Tensor::mutable_data(platform::Place place, size_t requested_size) { void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
size_t requested_size) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing."); "Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type(), requested_size); return mutable_data(place, type_, attr, requested_size);
} }
Tensor& Tensor::ShareDataWith(const Tensor& src) { Tensor& Tensor::ShareDataWith(const Tensor& src) {
...@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const { ...@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
Tensor dst; Tensor dst;
dst.holder_ = holder_; dst.holder_ = holder_;
dst.set_layout(layout_); dst.set_layout(layout_);
dst.type_ = type_;
DDim dst_dims = dims_; DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx; dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims); dst.Resize(dst_dims);
......
...@@ -67,12 +67,7 @@ class Tensor { ...@@ -67,12 +67,7 @@ class Tensor {
friend struct EigenVector; friend struct EigenVector;
public: public:
Tensor() : offset_(0) {} Tensor() : type_(typeid(float)), offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
...@@ -89,12 +84,17 @@ class Tensor { ...@@ -89,12 +84,17 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
T* mutable_data(platform::Place place, size_t requested_size = 0); T* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
void* mutable_data(platform::Place place, std::type_index type, void* mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0); size_t requested_size = 0);
void* mutable_data(platform::Place place, size_t requested_size = 0); void* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/** /**
* @brief Return a pointer to mutable memory block. * @brief Return a pointer to mutable memory block.
...@@ -106,7 +106,9 @@ class Tensor { ...@@ -106,7 +106,9 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0); T* mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/*! Return the dimensions of the memory block. */ /*! Return the dimensions of the memory block. */
const DDim& dims() const; const DDim& dims() const;
...@@ -139,7 +141,7 @@ class Tensor { ...@@ -139,7 +141,7 @@ class Tensor {
std::type_index type() const { std::type_index type() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor not initialized yet when Tensor::type() is called."); holder_, "Tensor not initialized yet when Tensor::type() is called.");
return holder_->type(); return type_;
} }
// memory size returns the holding memory size in byte. // memory size returns the holding memory size in byte.
...@@ -153,56 +155,13 @@ class Tensor { ...@@ -153,56 +155,13 @@ class Tensor {
void clear() { holder_ = nullptr; } void clear() { holder_ = nullptr; }
private: const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
/** size_t offset() const { return offset_; }
* @note Placeholder hides type T, so it doesn't appear as a template
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
};
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
private:
/*! holds the memory block if allocated. */ /*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_; std::shared_ptr<memory::Allocation> holder_;
std::type_index type_;
/** /**
* @brief points to elements dimensions. * @brief points to elements dimensions.
* *
......
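With the Placeholder/PlaceholderImpl hierarchy removed, the element type now lives on the Tensor itself as a std::type_index beside a type-erased allocation, and data<T>() checks it on access. A small standalone sketch of that layout (a byte vector stands in for memory::Allocation; everything else is illustrative):

#include <cassert>
#include <cstddef>
#include <typeindex>
#include <vector>

class TypedBuffer {
 public:
  TypedBuffer() : type_(typeid(float)) {}  // default element type, as in the new Tensor()

  template <typename T>
  T* MutableData(std::size_t count) {
    type_ = typeid(T);                       // record the element type on (re)allocation
    bytes_.resize(count * sizeof(T));
    return reinterpret_cast<T*>(bytes_.data());
  }

  template <typename T>
  const T* Data() const {
    // Access with the wrong T is rejected, like the PADDLE_ENFORCE(valid, ...) check.
    assert(type_ == std::type_index(typeid(T)));
    return reinterpret_cast<const T*>(bytes_.data());
  }

 private:
  std::vector<unsigned char> bytes_;  // stand-in for the shared memory::Allocation
  std::type_index type_;              // element type kept outside the buffer
};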
...@@ -23,10 +23,10 @@ namespace framework { ...@@ -23,10 +23,10 @@ namespace framework {
template <typename T> template <typename T>
inline const T* Tensor::data() const { inline const T* Tensor::data() const {
check_memory_size(); check_memory_size();
bool valid = std::is_same<T, void>::value || bool valid =
holder_->type() == std::type_index(typeid(T)); std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name()); type_.name());
return reinterpret_cast<const T*>( return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_); reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
...@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } ...@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T> template <typename T>
inline T* Tensor::data() { inline T* Tensor::data() {
check_memory_size(); check_memory_size();
bool valid = std::is_same<T, void>::value || bool valid =
holder_->type() == std::type_index(typeid(T)); std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name()); type_.name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place, inline T* Tensor::mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) { size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims); Resize(dims);
return mutable_data<T>(place, requested_size); return mutable_data<T>(place, attr, requested_size);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { inline T* Tensor::mutable_data(platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size)); return reinterpret_cast<T*>(
mutable_data(place, typeid(T), attr, requested_size));
} }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
......
...@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) { ...@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) {
TensorToStream(oss, gpu_tensor, gpu_ctx); TensorToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str()); std::istringstream iss(oss.str());
TensorFromStream(iss, &dst_tensor, gpu_ctx); TensorFromStream(
iss, &dst_tensor,
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace()); int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
......
...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output. // it is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
for (size_t index = 0; index < block_desc.OpSize(); index++) { for (size_t index = 0; index < block_desc.OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index]; auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
...@@ -45,8 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { ...@@ -45,8 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "dropout", "split", "prelu", "conv2d_transpose", "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"leaky_relu"}); "conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false; if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) { if (teller_set.count(node->Op()->Type())) {
......
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
...@@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc ...@@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL) DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
elementwise_add_op elementwise_mul_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL) DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...@@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc ...@@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL) DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL) split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
prelu_op SERIAL) prelu_op SERIAL)
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
static bool CheckDims(const nvinfer1::Dims& dims_x,
const nvinfer1::Dims& dims_y) {
if (dims_x.nbDims != dims_y.nbDims) {
return false;
}
for (int i = 0; i < dims_x.nbDims; i++) {
if (dims_x.d[i] != dims_y.d[i]) {
return false;
}
}
return true;
}
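// CheckDims returns true only when X and Y have the same rank and identical
// extents in every dimension. In that case ElementwiseTensorOpConverter below
// maps the op onto TensorRT's native IElementWiseLayer; otherwise it falls
// back to the ElementWisePlugin, which handles the broadcast (axis) case.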
class ElementwiseWeightOpConverter : public OpConverter { class ElementwiseWeightOpConverter : public OpConverter {
public: public:
ElementwiseWeightOpConverter() {} ElementwiseWeightOpConverter() {}
...@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
// Here the two nullptrs look strange; that's because the // Here the two nullptrs look strange; that's because the
// framework::OpDesc's constructor is strange. // framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
...@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter { ...@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
ElementwiseTensorOpConverter() {} ElementwiseTensorOpConverter() {}
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
auto op_pair = ops.find(op_type_);
PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
// Here the two nullptrs look strange; that's because the // Here the two nullptrs look strange; that's because the
// framework::OpDesc's constructor is strange. // framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
...@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter { ...@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_x = X->getDimensions();
nvinfer1::Dims dims_y = Y->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions();
// The two input tensor should have the same dims int axis = boost::get<int>(op_desc.GetAttr("axis"));
PADDLE_ENFORCE(dims_x.nbDims >= 3); auto output_name = op_desc.Output("Out")[0];
if (dims_x.nbDims == dims_y.nbDims) { if (CheckDims(dims_x, dims_y)) {
for (int i = 0; i < dims_x.nbDims; i++) { // The two input tensor should have the same dims
if (dims_x.d[i] != dims_y.d[i]) VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
} else {
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
auto op_pair = ops.find(op_type_); nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
if (op_pair == ops.end()) { engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
PADDLE_THROW("Wrong elementwise op type!"); *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
}
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0]; layer->setName(("elementwise (Output: " + output_name + ")").c_str());
layer->setName(("elementwise (Output: " + output_name + ")").c_str()); layer->getOutput(0)->setName(output_name.c_str());
layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0));
engine_->SetITensor(output_name, layer->getOutput(0)); } else {
VLOG(3) << "Convert a fluid elementwise op to TensorRT "
"ElementWisePluginLayer";
plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
plugin->AddInput(X);
plugin->AddInput(Y);
nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
reinterpret_cast<plugin::PluginTensorRT*>(plugin));
layer->setName(("elementwise (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
}
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside. // output, so place the declaration inside.
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -61,7 +61,7 @@ class OpConverter { ...@@ -61,7 +61,7 @@ class OpConverter {
// TODO(xingzhaolong): all mul, sub, div // TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul", // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"}; // "sub", "div"};
static std::unordered_set<std::string> add_weight_op_set{"add"}; static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
int op_type_len = op_desc.Type().size(); int op_type_len = op_desc.Type().size();
std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
......
...@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter { ...@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter {
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data), static_cast<void*>(alpha_data),
alpha_tensor_device->numel()); alpha_tensor_device->numel());
PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode); plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer = nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin); engine_->AddPlugin(&input, input_num, plugin);
// keep the alpha tensor alive to avoid releasing its memory // keep the alpha tensor alive to avoid releasing its memory
......
...@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter { ...@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter {
PADDLE_ENFORCE(output_lengths.size() == output_num); PADDLE_ENFORCE(output_lengths.size() == output_num);
// //
SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
nvinfer1::IPluginLayer* layer = nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin); engine_->AddPlugin(&input, input_num, plugin);
......
...@@ -20,13 +20,12 @@ namespace paddle { ...@@ -20,13 +20,12 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
TEST(elementwise_op, add_weight_test) { TEST(elementwise_op, add_weight) {
std::unordered_set<std::string> parameters({"elementwise_add-Y"}); std::unordered_set<std::string> parameters({"elementwise_add-Y"});
framework::Scope scope; framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1 << 15); TRTConvertValidation validator(10, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description // Prepare Op description
...@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) { ...@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
validator.Execute(8); validator.Execute(8);
} }
TEST(elementwise_op, add_tensor_test) { TEST(elementwise_op, native) {
std::unordered_set<std::string> parameters; for (std::string type : {"add", "mul"}) {
framework::Scope scope; int batch_size = 8;
TRTConvertValidation validator(8, parameters, scope, 1 << 15); std::unordered_set<std::string> parameters;
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); framework::Scope scope;
validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclInputVar("elementwise_" + type + "-X",
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); nvinfer1::DimsCHW(10, 3, 3));
validator.DeclInputVar("elementwise_" + type + "-Y",
// Prepare Op description nvinfer1::Dims3(10, 3, 3));
framework::OpDesc desc; validator.DeclOutputVar("elementwise_" + type + "-Out",
desc.SetType("elementwise_add"); nvinfer1::DimsCHW(10, 3, 3));
desc.SetInput("X", {"elementwise_add-X"});
desc.SetInput("Y", {"elementwise_add-Y"}); // Prepare Op description
desc.SetOutput("Out", {"elementwise_add-Out"}); framework::OpDesc desc;
desc.SetType("elementwise_" + type);
// the defalut axis of elementwise op is -1 desc.SetInput("X", {"elementwise_" + type + "-X"});
desc.SetInput("Y", {"elementwise_" + type + "-Y"});
validator.SetOp(*desc.Proto()); desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
int axis = -1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(batch_size);
}
}
validator.Execute(8); TEST(elementwise_op, plugin) {
for (std::string type : {"add", "mul"}) {
int batch_size = 8;
std::unordered_set<std::string> parameters;
framework::Scope scope;
TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_" + type + "-X",
nvinfer1::DimsCHW(10, 3, 3));
validator.DeclInputVar("elementwise_" + type + "-Y",
nvinfer1::Dims3(10, 1, 1));
validator.DeclOutputVar("elementwise_" + type + "-Out",
nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("elementwise_" + type);
desc.SetInput("X", {"elementwise_" + type + "-X"});
desc.SetInput("Y", {"elementwise_" + type + "-Y"});
desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
int axis = -1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(batch_size);
}
} }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(elementwise_add); USE_OP(elementwise_add);
USE_OP(elementwise_mul);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
......
...@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() { ...@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
} }
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { nvinfer1::ITensor *const *inputs, int num_inputs,
plugin::PluginTensorRT *plugin) {
owned_plugin_.emplace_back(plugin); owned_plugin_.emplace_back(plugin);
return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase { ...@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase {
int GetRuntimeBatch(); int GetRuntimeBatch();
int GetDevice() { return device_; } int GetDevice() { return device_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int nbInputs, PluginTensorRT*); int num_inputs, plugin::PluginTensorRT*);
// The TRT weight needs a pointer to CPU memory. // The TRT weight needs a pointer to CPU memory.
// Before TRT runs, fluid loads the weight into GPU storage. // Before TRT runs, fluid loads the weight into GPU storage.
...@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase { ...@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase {
// The specific GPU id that the TensorRTEngine bounded to. // The specific GPU id that the TensorRTEngine bounded to.
int device_; int device_;
std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_; std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
// TensorRT related internal members // TensorRT related internal members
template <typename T> template <typename T>
......
nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce tensorrt_engine) nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu
DEPS enforce tensorrt_engine)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
namespace details {
template <typename T>
struct Add {
__device__ T operator()(const T& a, const T& b) const { return a + b; }
};
template <typename T>
struct Mul {
__device__ T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T, typename Operator>
__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
int batch_size, int num_rows, int num_cols) {
for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
int row = blockIdx.x;
for (; row < num_rows; row += gridDim.x) {
T value_y = y[batch_id * num_rows + row];
int col = threadIdx.x;
int offset = (batch_id * num_rows + row) * num_cols;
for (; col < num_cols; col += blockDim.x) {
T value_x = x[offset + col];
out[offset + col] = op(value_x, value_y);
}
}
}
}
template <typename T, typename Operator>
static void ElementWise(Operator op, const T* x, const T* y, T* out,
int batch_size, int prev, int midd, int post,
cudaStream_t stream) {
const int kThreadsPerBlock = 1024;
const int kMaximumBlocks = 65535;
if (prev == 1) {
int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
: (((post + 31) >> 5) << 5);
int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
op, x, y, out, batch_size, midd, post);
} else if (post == 1) {
PADDLE_THROW("Not implemented.");
} else {
PADDLE_THROW("Not implemented.");
}
}
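// Launch configuration: num_threads is `post` rounded up to a multiple of 32
// (one warp) and capped at kThreadsPerBlock, e.g. post = 9 gives
// ((9 + 31) >> 5) << 5 = 32 threads; num_blocks is `midd` capped at
// kMaximumBlocks. Only the prev == 1 (column-wise broadcast) case is
// implemented so far.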
} // namespace details
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims* input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
return input_dims[0];
}
int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims;
for (; trimed_nb_dims > 0; --trimed_nb_dims) {
if (dims_y_.d[trimed_nb_dims - 1] != 1) {
break;
}
}
dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
prev_size_ = 1;
midd_size_ = 1;
post_size_ = 1;
for (int i = 0; i < axis_; ++i) {
prev_size_ *= dims_x_.d[i];
}
for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch.");
midd_size_ *= dims_y_.d[i];
}
for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
post_size_ *= dims_x_.d[i];
}
return 0;
}
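// Worked example: for dims_x = (10, 3, 3), dims_y = (10, 1, 1) and axis = -1,
// as in the elementwise "plugin" unit test in this change, axis_ becomes
// 3 - 3 = 0, dims_y_ is trimmed to (10), and the sizes come out as
// prev_size_ = 1, midd_size_ = 10, post_size_ = 9, so enqueue() below takes
// the ColumnWiseKernel path.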
int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
void** outputs, void* workspace,
cudaStream_t stream) {
const float* x = reinterpret_cast<const float*>(inputs[0]);
const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else {
PADDLE_THROW("Not implemented.");
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
axis_(axis),
prev_size_(1),
midd_size_(1),
post_size_(1) {}
ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
}
ElementWisePlugin *clone() const override {
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return nullptr;
}
const char *getPluginType() const override { return "elementwise"; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
int num_inputs) override;
int initialize() override;
// execute the layer
int enqueue(int batch_size, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream);
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) +
SerializedSize(dims_y_) + getBaseSerializationSize();
}
void serialize(void *buffer) override {
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
}
nvinfer1::ElementWiseOperation type_;
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
int axis_;
int prev_size_;
int midd_size_;
int post_size_;
};
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
static const int CUDA_NUM_THREADS = 1024; static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535; static const int CUDA_MAX_NUM_BLOCKS = 65535;
...@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs, ...@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
} // namespace plugin
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT { class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_; TensorRTEngine::Weight alpha_;
...@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT { ...@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT {
void *workspace, cudaStream_t stream) override; void *workspace, cudaStream_t stream) override;
}; };
} // namespace plugin
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -14,10 +14,15 @@ ...@@ -14,10 +14,15 @@
#pragma once #pragma once
#include <cassert>
#include <cstring> #include <cstring>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T> template <typename T>
inline void SerializeValue(void** buffer, T const& value); inline void SerializeValue(void** buffer, T const& value);
...@@ -26,7 +31,7 @@ template <typename T> ...@@ -26,7 +31,7 @@ template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size, inline void DeserializeValue(void const** buffer, size_t* buffer_size,
T* value); T* value);
namespace { namespace details {
template <typename T, class Enable = void> template <typename T, class Enable = void>
struct Serializer {}; struct Serializer {};
...@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value || ...@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value || std::is_enum<T>::value ||
std::is_pod<T>::value>::type> { std::is_pod<T>::value>::type> {
static size_t SerializedSize(T const& value) { return sizeof(T); } static size_t SerializedSize(T const& value) { return sizeof(T); }
static void Serialize(void** buffer, T const& value) { static void Serialize(void** buffer, T const& value) {
std::memcpy(*buffer, &value, sizeof(T)); std::memcpy(*buffer, &value, sizeof(T));
reinterpret_cast<char*&>(*buffer) += sizeof(T); reinterpret_cast<char*&>(*buffer) += sizeof(T);
} }
static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
assert(*buffer_size >= sizeof(T)); assert(*buffer_size >= sizeof(T));
std::memcpy(value, *buffer, sizeof(T)); std::memcpy(value, *buffer, sizeof(T));
...@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value || ...@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
template <> template <>
struct Serializer<const char*> { struct Serializer<const char*> {
static size_t SerializedSize(const char* value) { return strlen(value) + 1; } static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
static void Serialize(void** buffer, const char* value) { static void Serialize(void** buffer, const char* value) {
std::strcpy(static_cast<char*>(*buffer), value); std::strcpy(static_cast<char*>(*buffer), value); // NOLINT
reinterpret_cast<char*&>(*buffer) += strlen(value) + 1; reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
} }
static void Deserialize(void const** buffer, size_t* buffer_size, static void Deserialize(void const** buffer, size_t* buffer_size,
const char** value) { const char** value) {
*value = static_cast<char const*>(*buffer); *value = static_cast<char const*>(*buffer);
...@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>, ...@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
static size_t SerializedSize(std::vector<T> const& value) { static size_t SerializedSize(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T); return sizeof(value.size()) + value.size() * sizeof(T);
} }
static void Serialize(void** buffer, std::vector<T> const& value) { static void Serialize(void** buffer, std::vector<T> const& value) {
SerializeValue(buffer, value.size()); SerializeValue(buffer, value.size());
size_t nbyte = value.size() * sizeof(T); size_t nbyte = value.size() * sizeof(T);
std::memcpy(*buffer, value.data(), nbyte); std::memcpy(*buffer, value.data(), nbyte);
reinterpret_cast<char*&>(*buffer) += nbyte; reinterpret_cast<char*&>(*buffer) += nbyte;
} }
static void Deserialize(void const** buffer, size_t* buffer_size, static void Deserialize(void const** buffer, size_t* buffer_size,
std::vector<T>* value) { std::vector<T>* value) {
size_t size; size_t size;
DeserializeValue(buffer, buffer_size, &size); DeserializeValue(buffer, buffer_size, &size);
value->resize(size); value->resize(size);
size_t nbyte = value->size() * sizeof(T); size_t nbyte = value->size() * sizeof(T);
assert(*buffer_size >= nbyte); PADDLE_ENFORCE_GE(*buffer_size, nbyte);
std::memcpy(value->data(), *buffer, nbyte); std::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte; reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte; *buffer_size -= nbyte;
} }
}; };
} // namespace } // namespace details
template <typename T> template <typename T>
inline size_t SerializedSize(T const& value) { inline size_t SerializedSize(T const& value) {
return Serializer<T>::SerializedSize(value); return details::Serializer<T>::SerializedSize(value);
} }
template <typename T> template <typename T>
inline void SerializeValue(void** buffer, T const& value) { inline void SerializeValue(void** buffer, T const& value) {
return Serializer<T>::Serialize(buffer, value); return details::Serializer<T>::Serialize(buffer, value);
} }
template <typename T> template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size, inline void DeserializeValue(void const** buffer, size_t* buffer_size,
T* value) { T* value) {
return Serializer<T>::Deserialize(buffer, buffer_size, value); return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
} }
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
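A minimal round-trip sketch of these serialization helpers (illustrative only, not part of this change; it assumes the snippet lives inside the paddle::inference::tensorrt::plugin namespace defined above, and <vector> is already included by this header):
// Sketch: serialize an int and a std::vector<int>, then read them back.
inline void SerializeRoundTripExample() {
  int axis = 1;
  std::vector<int> lengths = {2, 3, 5};
  std::vector<char> buf(SerializedSize(axis) + SerializedSize(lengths));
  void* write_ptr = buf.data();
  SerializeValue(&write_ptr, axis);     // advances write_ptr by sizeof(int)
  SerializeValue(&write_ptr, lengths);  // writes size() followed by the data
  const void* read_ptr = buf.data();
  size_t remaining = buf.size();
  int axis_out = 0;
  std::vector<int> lengths_out;
  DeserializeValue(&read_ptr, &remaining, &axis_out);
  DeserializeValue(&read_ptr, &remaining, &lengths_out);
  // Now axis_out == 1, lengths_out == {2, 3, 5}, and remaining == 0.
}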
...@@ -12,26 +12,26 @@ ...@@ -12,26 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <stdio.h>
#include <cassert>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, nvinfer1::Dims SplitPlugin::getOutputDimensions(
const nvinfer1::Dims* inputDims, int index, const nvinfer1::Dims* input_dims, int num_inputs) {
int nbInputs) { PADDLE_ENFORCE_EQ(num_inputs, 1);
assert(nbInputs == 1); PADDLE_ENFORCE_LT(index, this->getNbOutputs());
assert(index < this->getNbOutputs());
nvinfer1::Dims const& input_dims = inputDims[0]; nvinfer1::Dims output_dims = input_dims[0];
nvinfer1::Dims output_dims = input_dims;
output_dims.d[axis_] = output_length_.at(index); output_dims.d[axis_] = output_length_.at(index);
return output_dims; return output_dims;
} }
int SplitPlugin::initialize() { int SplitPlugin::initialize() {
PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
std::vector<int> segment_offsets(1, 0); std::vector<int> segment_offsets(1, 0);
for (int i = 0; i < this->getNbOutputs(); ++i) { for (int i = 0; i < this->getNbOutputs(); ++i) {
segment_offsets.push_back(segment_offsets.back() + output_length_[i]); segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
...@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, ...@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
} // tensorrt } // namespace plugin
} // inference } // namespace tensorrt
} // paddle } // namespace inference
} // namespace paddle
...@@ -14,61 +14,58 @@ ...@@ -14,61 +14,58 @@
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
class SplitPlugin : public PluginTensorRT { class SplitPlugin : public PluginTensorRT {
int axis_; public:
std::vector<int> output_length_; SplitPlugin(int axis, std::vector<int> const &output_lengths)
int nx_, ny_, nz_; : axis_(axis), output_length_(output_lengths) {}
std::vector<int> segment_offsets_;
SplitPlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &output_length_);
}
SplitPlugin *clone() const override {
return new SplitPlugin(axis_, output_length_);
}
const char *getPluginType() const override { return "split"; }
int getNbOutputs() const override { return output_length_.size(); }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
int num_inputs) override;
int initialize() override;
int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
protected: protected:
virtual size_t getSerializationSize() override { size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(output_length_) + return SerializedSize(axis_) + SerializedSize(output_length_) +
getBaseSerializationSize(); getBaseSerializationSize();
} }
// TRT will call this func when we need to serialize the configuration of void serialize(void *buffer) override {
// tensorrt.
// It should not be called by users.
virtual void serialize(void *buffer) override {
serializeBase(buffer); serializeBase(buffer);
SerializeValue(&buffer, axis_); SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_); SerializeValue(&buffer, output_length_);
} }
public: int axis_;
SplitPlugin(int axis, std::vector<int> const &output_lengths) std::vector<int> output_length_;
: axis_(axis), output_length_(output_lengths) { int nx_, ny_, nz_;
assert(axis <= nvinfer1::Dims::MAX_DIMS); std::vector<int> segment_offsets_;
}
// It was used for tensorrt deserialization.
// It should not be called by users.
SplitPlugin(void const *serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &axis_);
DeserializeValue(&serialData, &serialLength, &output_length_);
}
SplitPlugin *clone() const override {
return new SplitPlugin(axis_, output_length_);
}
virtual const char *getPluginType() const override { return "split"; }
virtual int getNbOutputs() const override { return output_length_.size(); }
virtual nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *inputs,
int nbInputDims) override;
virtual int initialize() override;
virtual int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
}; };
} // tensorrt } // namespace plugin
} // inference } // namespace tensorrt
} // paddle } // namespace inference
} // namespace paddle
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
void PluginTensorRT::serializeBase(void*& buffer) { void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue(&buffer, input_dims_); SerializeValue(&buffer, input_dims_);
...@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) { ...@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue(&buffer, data_format_); SerializeValue(&buffer, data_format_);
} }
void PluginTensorRT::deserializeBase(void const*& serialData, void PluginTensorRT::deserializeBase(void const*& serial_data,
size_t& serialLength) { size_t& serial_length) {
DeserializeValue(&serialData, &serialLength, &input_dims_); DeserializeValue(&serial_data, &serial_length, &input_dims_);
DeserializeValue(&serialData, &serialLength, &max_batch_size_); DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
DeserializeValue(&serialData, &serialLength, &data_type_); DeserializeValue(&serial_data, &serial_length, &data_type_);
DeserializeValue(&serialData, &serialLength, &data_format_); DeserializeValue(&serial_data, &serial_length, &data_format_);
} }
size_t PluginTensorRT::getBaseSerializationSize() { size_t PluginTensorRT::getBaseSerializationSize() {
...@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, ...@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
(format == nvinfer1::PluginFormat::kNCHW)); (format == nvinfer1::PluginFormat::kNCHW));
} }
void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, void PluginTensorRT::configureWithFormat(
int nbInputs, const nvinfer1::Dims* input_dims, int num_inputs,
const nvinfer1::Dims* outputDims, const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type,
int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int max_batch_size) {
nvinfer1::PluginFormat format,
int maxBatchSize) {
data_type_ = type; data_type_ = type;
data_format_ = format; data_format_ = format;
input_dims_.assign(inputDims, inputDims + nbInputs); input_dims_.assign(input_dims, input_dims + num_inputs);
max_batch_size_ = maxBatchSize; max_batch_size_ = max_batch_size;
} }
} // namespace plugin
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -14,23 +14,30 @@ ...@@ -14,23 +14,30 @@
#pragma once #pragma once
#include <cassert> #include <NvInfer.h>
#include <cstring> #include <cstring>
#include <iostream>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "NvInfer.h"
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" #include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin {
class PluginTensorRT : public nvinfer1::IPluginExt { class PluginTensorRT : public nvinfer1::IPluginExt {
public: public:
PluginTensorRT() {} PluginTensorRT() {}
// This constructor is used for TensorRT deserialization.
// It should not be called by users.
PluginTensorRT(const void* serialized_data, size_t length) {} PluginTensorRT(const void* serialized_data, size_t length) {}
virtual ~PluginTensorRT() {}
nvinfer1::Dims const& getInputDims(int index) const { nvinfer1::Dims const& getInputDims(int index) const {
return input_dims_.at(index); return input_dims_.at(index);
} }
...@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt { ...@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
nvinfer1::DataType getDataType() const { return data_type_; } nvinfer1::DataType getDataType() const { return data_type_; }
nvinfer1::PluginFormat getDataFormat() const { return data_format_; } nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
virtual const char* getPluginVersion() const { return "1"; } virtual const char* getPluginVersion() const { return "1"; }
void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); }
std::vector<nvinfer1::ITensor*>& GetInputs() { return inputs_; }
virtual nvinfer1::IPluginExt* clone() const = 0;
virtual const char* getPluginType() const = 0;
// The following functions are inherited from nvinfer1::IPluginExt.
// Get the number of outputs from the layer
int getNbOutputs() const { return 1; }
// Get the dimension of an output tensor
virtual nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* input_dims,
int num_inputs) = 0;
// Find the workspace size required by the layer
size_t getWorkspaceSize(int) const override { return 0; } size_t getWorkspaceSize(int) const override { return 0; }
// Initialize the layer for execution.
// This is called when the engine is created.
int initialize() override { return 0; }
// Shutdown the layer. This is called when the engine is destroyed
void terminate() override {} void terminate() override {}
virtual ~PluginTensorRT() {} // Execute the layer
virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) = 0;
// Find the size of the serialization buffer required
virtual size_t getSerializationSize() = 0;
// Serialize the layer config to buffer.
// TensorRT will call this function to serialize the configuration of the
// TensorRT engine. It should not be called by users.
virtual void serialize(void* buffer) = 0;
// Check format support. The default is FLOAT32 and NCHW. // Check format support. The default is FLOAT32 and NCHW.
bool supportsFormat(nvinfer1::DataType type, bool supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const override; nvinfer1::PluginFormat format) const override;
void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, // Configure the layer
const nvinfer1::Dims* outputDims, int nbOutputs, void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs,
const nvinfer1::Dims* output_dims, int num_outputs,
nvinfer1::DataType type, nvinfer1::DataType type,
nvinfer1::PluginFormat format, nvinfer1::PluginFormat format,
int maxBatchSize) override; int max_batch_size) override;
// *NOTE* The following functions need to be overrided in the subclass.
virtual nvinfer1::IPluginExt* clone() const = 0;
virtual const char* getPluginType() const = 0;
// Initialize the layer for execution. This is called when the engine is
// created.
int initialize() override { return 0; }
// Serialize the layer config to buffer.
virtual void serialize(void* buffer) = 0;
virtual size_t getSerializationSize() = 0;
virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) = 0;
protected: protected:
// Deserialize input_dims, max_batch_size, data_type, data_format // Deserialize input_dims, max_batch_size, data_type, data_format
void deserializeBase(void const*& serialData, size_t& serialLength); void deserializeBase(void const*& serial_data, // NOLINT
size_t& serial_length); // NOLINT
size_t getBaseSerializationSize(); size_t getBaseSerializationSize();
// Serialize input_dims, max_batch_size, data_type, data_format // Serialize input_dims, max_batch_size, data_type, data_format
void serializeBase(void*& buffer); void serializeBase(void*& buffer); // NOLINT
std::vector<nvinfer1::Dims> input_dims_; std::vector<nvinfer1::Dims> input_dims_;
size_t max_batch_size_; size_t max_batch_size_;
nvinfer1::DataType data_type_; nvinfer1::DataType data_type_;
nvinfer1::PluginFormat data_format_; nvinfer1::PluginFormat data_format_;
std::vector<nvinfer1::ITensor*> inputs_;
}; };
} // namespace plugin
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
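To make the contract above concrete, here is a minimal pass-through plugin sketch built on PluginTensorRT (illustrative only, not part of this change; the class name and the FP32 copy are invented, and error handling is omitted):
// Hypothetical example: a plugin that copies its single input to its output.
class IdentityPlugin : public PluginTensorRT {
 public:
  IdentityPlugin() {}
  // Deserialization constructor, mirroring the other plugins in this change.
  IdentityPlugin(void const* serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
  }
  IdentityPlugin* clone() const override { return new IdentityPlugin(); }
  const char* getPluginType() const override { return "identity"; }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims* input_dims,
                                     int num_inputs) override {
    return input_dims[0];  // same shape as the input
  }
  int enqueue(int batch_size, const void* const* inputs, void** outputs,
              void* workspace, cudaStream_t stream) override {
    const nvinfer1::Dims& dims = getInputDims(0);
    size_t count = static_cast<size_t>(batch_size);
    for (int i = 0; i < dims.nbDims; ++i) count *= dims.d[i];
    // Assumes FP32, the only type supportsFormat() accepts by default.
    cudaMemcpyAsync(outputs[0], inputs[0], count * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
    return cudaGetLastError() != cudaSuccess;
  }
 protected:
  size_t getSerializationSize() override { return getBaseSerializationSize(); }
  void serialize(void* buffer) override { serializeBase(buffer); }
};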
...@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { ...@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config); LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
return; return;
} }
LOG(INFO) << *config; LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
} }
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
......
add_subdirectory(detail) add_subdirectory(detail)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce) cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory cc_library(memory
DEPS DEPS
malloc malloc
memcpy) memcpy)
cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
#if (WITH_GPU) #if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif() #endif()
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
if (WITH_GPU)
nv_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
best_fit_allocator_test.cu
DEPS best_fit_allocator
locked_allocator
cpu_allocator
cuda_allocator
device_context
memcpy)
else()
cc_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
DEPS best_fit_allocator
locked_allocator
cpu_allocator)
endif()
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
else ()
set(AllocatorFacadeDeps)
endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
allocator_strategy
legacy_allocator
)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
ThinAlignedAllocator::ThinAlignedAllocator(
std::shared_ptr<Allocator> underlyning_allocator)
: underlying_allocator_(std::move(underlyning_allocator)) {}
bool ThinAlignedAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// The aligned allocation and allocator wrap a managed allocator and return
// an aligned pointer.
//
// NOTE(yy): For speed reasons, I just use a template parameter to get the
// alignment; however, it could be a private member if necessary.
//
// NOTE(yy): kAlignment must be 2^N; the `static_assert` below enforces this.
template <size_t kAlignment>
class AlignedAllocation : public Allocation {
static_assert(kAlignment > 0 && (kAlignment & (kAlignment - 1)) == 0,
"kAlignment must be 2^N");
public:
AlignedAllocation(AllocationPtr&& underlying_allocation, size_t size)
: Allocation(AlignedPtr(underlying_allocation->ptr()),
size + kAlignment - Offset(underlying_allocation->ptr()),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
private:
static void* AlignedPtr(void* ptr) {
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
Offset(ptr));
}
// Offset from `ptr` to the next kAlignment-aligned address.
// If ptr is already aligned, returns 0.
static size_t Offset(void* ptr) {
auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
intptr_t diff = aligned_addr - ptr_addr;
if (diff == 0) {
return 0;
} else {
return kAlignment + diff;
}
}
AllocationPtr underlying_allocation_;
};
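// Worked example with kAlignment = 64: for an underlying pointer at address
// 0x1008, aligned_addr = 0x1000 and diff = -8, so Offset() returns
// 64 - 8 = 56 and the exposed pointer is 0x1008 + 56 = 0x1040, which is
// 64-byte aligned. If the pointer is already aligned, Offset() returns 0 and
// no shift is applied.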
// The thin aligned allocator is trivial and is used to keep the binary size
// small.
//
// NOTE(yy): This is a trick for the template class below. This class extracts
// the common code into a `thin` base class, so multiple specializations of
// the template class do not grow the binary size too much.
//
// NOTE(yy): This could be an over-design. If it harms the readability of the
// code, it could be removed later.
class ThinAlignedAllocator : public Allocator {
public:
explicit ThinAlignedAllocator(
std::shared_ptr<Allocator> underlyning_allocator);
bool IsAllocThreadSafe() const;
protected:
std::shared_ptr<Allocator> underlying_allocator_;
};
// An aligned allocator allocates `size + kAlignment` bytes for each request
// and adjusts the returned pointer to the alignment boundary.
template <size_t kAlignment>
class AlignedAllocator : public ThinAlignedAllocator {
public:
using ThinAlignedAllocator::ThinAlignedAllocator;
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
auto raw_allocation =
underlying_allocator_->Allocate(size + kAlignment, attr);
return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
}
};
} // namespace allocation
} // namespace memory
} // namespace paddle
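A small usage sketch (illustrative only, not part of this change; the cpu_allocator.h header path and the Allocator::kDefault attr value are assumptions, not shown in this diff):
// Hypothetical wiring: every pointer handed out is 64-byte aligned.
#include <memory>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"  // assumed path
void AlignedAllocationExample() {
  using namespace paddle::memory::allocation;  // brevity only
  auto aligned =
      std::make_shared<AlignedAllocator<64>>(std::make_shared<CPUAllocator>());
  // AllocateImpl() asks the underlying allocator for size + kAlignment bytes
  // and AlignedAllocation shifts the pointer, so ptr() is 64-byte aligned.
  auto block = aligned->Allocate(1024, Allocator::kDefault);  // kDefault assumed
  void* p = block->ptr();
  (void)p;  // memory is released when `block` goes out of scope
}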
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "unsupported/Eigen/CXX11/Tensor"
// NOTE(yy): this unittest is not important. It is just used for debugging
// and can be removed later.
struct FillZero {
public:
float* ptr_;
__device__ void operator()(size_t i) { ptr_[i] = 0.0f; }
};
namespace paddle {
TEST(Eigen, main) {
framework::Tensor tensor;
platform::CUDAPlace gpu(0);
float* ptr = tensor.mutable_data<float>({10, 10}, gpu);
auto& dev_ctx = *reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(gpu));
PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * 100));
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, 100);
for_range(FillZero{ptr});
dev_ctx.Wait();
auto eigen_vec = framework::EigenVector<float>::Flatten(tensor);
auto& eigen_dev = *dev_ctx.eigen_device();
eigen_vec.device(eigen_dev) = eigen_vec.constant(0.0f);
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
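// An Allocation that keeps the wrapped AllocationPtr alive while exposing the
// same ptr/size/place. Allocators that hand out memory obtained from another
// allocator (e.g. BufferedAllocator) use it to tie the lifetime of the
// returned allocation to the underlying one.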
class AllocationWithUnderlying : public Allocation {
public:
explicit AllocationWithUnderlying(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
allocation_(std::move(allocation)) {}
AllocationPtr allocation_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator.h"
#include <functional>
namespace paddle {
namespace memory {
namespace allocation {
Allocation::~Allocation() {}
Allocator::~Allocator() {}
bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr);
ptr->set_allocator(this);
return AllocationPtr(ptr);
}
void Allocator::Free(Allocation* allocation) { delete allocation; }
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
auto* allocator = allocation->allocator();
allocator->Free(allocation);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
// Exception thrown when `Alloc`/`AllocShared` fails.
class BadAlloc : public std::exception {
public:
explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
const char* what() const noexcept override;
private:
std::string msg_;
};
class Allocation;
class AllocationDeleter {
public:
void operator()(Allocation* allocation) const;
};
class Allocator;
// Allocation is the object that holds the actual pointer; use
// `Allocation::ptr()` to get the allocated pointer.
//
// NOTE: this is the base allocation class. Each allocator can use its own
// allocation object.
// NOTE: `Allocation::ptr()` can be nullptr if the allocation size is 0.
class Allocation {
public:
Allocation(void* ptr, size_t size, platform::Place place)
: allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
  // Returns the held pointer.
  // NOTE: For performance reasons, it is better not to make this method
  // virtual. If we want to implement `defragmentation` later, we might need
  // to make the `ptr_` field protected and add a virtual method such as
  // `defragmentation` to change `ptr_`.
void* ptr() const { return ptr_; }
  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
  // last valid byte.
  //
  // NOTE: Some allocators might allocate more memory than requested, so the
  // size can be larger than the requested size. For example, the
  // AlignedAllocator always allocates `size + kAlignment` bytes. The raw
  // pointer might not be aligned, so an offset might be added to the raw
  // pointer. The size of such an allocation is `size + kAlignment - offset`.
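  // For example (illustrative numbers): requesting 100 bytes from an
  // AlignedAllocator<64> whose raw pointer needs an offset of 56 yields
  // size() == 100 + 64 - 56 == 108.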
size_t size() const { return size_; }
const platform::Place& place() const { return place_; }
Allocator* allocator() { return allocator_; }
void set_allocator(Allocator* allocator) { allocator_ = allocator; }
virtual ~Allocation();
private:
Allocator* allocator_;
void* ptr_;
size_t size_;
platform::Place place_;
};
using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
// Base interface class of the memory Allocator.
// To allocate memory, an allocator needs two parameters:
// 1. the size in bytes;
// 2. the attribute of the memory.
// NOTE: the memory attribute might be ignored if the allocator does not care
// about it.
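// For example (an illustrative call using the names declared below;
// `some_allocator` is hypothetical):
//
//   AllocationPtr workspace =
//       some_allocator->Allocate(4096, Allocator::kScratchpad);
//
// An allocator that does not distinguish attributes may treat this the same
// as a kDefault request.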
class Allocator {
public:
enum Attr {
    kDefault = 0,    // Default attribute. Uses the fastest or most stable
                     // allocation algorithm.
    kFixedHuge = 1,  // The allocation may not be freed until the program
                     // ends. e.g., `Parameters` and `Momentum`.
    kFluxHuge = 2,   // The allocation may be created and freed frequently
                     // and is considerably huge. Like `activations` and
                     // gradients.
    kScratchpad =
        3,  // The `Scratchpad` memory is allocated and freed very soon,
            // usually within an operator or as auxiliary memory.
            // Like the CUDNN workspace, AUX memory in batch norm, etc.
            //
            // https://en.wikipedia.org/wiki/Scratchpad_memory
    kCrossDevice =
        4,  // The memory is used for cross-device memory copy/communication.
            // For example:
            // 1. it can use `pinned` memory for CPU-GPU communication;
            // 2. it can use `registered` memory for RDMA communication.
    NumOfAttrs = 5  // The number of all attributes. It is used internally.
};
virtual ~Allocator();
// Allocate an allocation.
AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault);
  // Returns true if `Allocate` is thread-safe.
virtual bool IsAllocThreadSafe() const;
protected:
virtual void Free(Allocation* allocation);
virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
private:
friend class AllocationDeleter;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator.h"
#include <gflags/gflags.h>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
DEFINE_int64(
gpu_allocator_retry_time, 0,
"The retry time (milliseconds) when allocator fails "
"to allocate memory. No retry if this value is not greater than 0");
namespace paddle {
namespace memory {
namespace allocation {
// TODO(yy): Dirty code here. This class should be configurable at runtime.
class CPUManagedAllocator : public Allocator {
public:
CPUManagedAllocator() : normal_allocator_(new CPUAllocator()) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
return normal_allocator_->Allocate(size, attr).release();
}
private:
std::shared_ptr<Allocator> normal_allocator_;
};
// TODO(yy): Dirty code here. This class should be configurable at runtime.
class ChunkedAllocator : public Allocator {
public:
explicit ChunkedAllocator(std::unique_ptr<Allocator> system_allocator,
size_t max_chunk_size, size_t capacity = 1,
int64_t retry_time = -1)
: max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
raw_allocator_ = std::move(system_allocator);
if (max_chunk_size_ == 0) {
default_allocator_ = raw_allocator_;
} else {
if (capacity == 1) {
VLOG(10) << "Create BestFitAllocator with chunk_size "
<< max_chunk_size_;
default_allocator_ = CreateAllocatorWithChunk();
} else {
VLOG(10) << "Create AutoIncrementAllocator with chunk_size "
<< max_chunk_size_ << " and capacity " << capacity;
default_allocator_ = std::make_shared<AutoIncrementAllocator>(
            [this] { return CreateAllocatorWithChunk(); }, capacity);
}
}
auto* cond_allocator = new ConditionalAllocator();
cond_allocator
->AddAllocator(
[this](size_t size, Attr attr) { return size < max_chunk_size_; },
default_allocator_)
.AddAllocator(
[](size_t size, Attr attr) {
return true; // default case
},
raw_allocator_);
default_allocator_.reset(cond_allocator);
}
~ChunkedAllocator() override {
// Specify destruct order.
default_allocator_.reset();
chunks_.clear();
raw_allocator_.reset();
}
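  // Each chunk allocator below is a BestFitAllocator over one raw chunk,
  // wrapped in a LockedAllocator for thread safety, optionally in a
  // RetryAllocator when retry_time_ > 0, and finally in an
  // AlignedAllocator<64> so that returned pointers are 64-byte aligned.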
std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
auto* allocation = chunks_.back().get();
std::unique_ptr<Allocator> allocator(new LockedAllocator(
std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
return default_allocator_->Allocate(size, attr).release();
}
protected:
size_t max_chunk_size_;
int64_t retry_time_;
std::vector<AllocationPtr> chunks_;
std::shared_ptr<Allocator> raw_allocator_;
std::shared_ptr<Allocator> default_allocator_;
};
#ifdef PADDLE_WITH_CUDA
class CUDAChunkedAllocator : public ChunkedAllocator {
public:
explicit CUDAChunkedAllocator(int dev_id)
: ChunkedAllocator(std::unique_ptr<Allocator>(
new CUDAAllocator(platform::CUDAPlace(dev_id))),
                          GetMaxChunkSize(dev_id), GetCapacity(dev_id),
GetRetryTime()) {}
private:
static size_t GetMaxChunkSize(int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
return platform::GpuMaxChunkSize();
}
  static size_t GetCapacity(int dev_id) {
platform::CUDADeviceGuard guard(dev_id);
size_t available, total;
platform::GpuMemoryUsage(&available, &total);
size_t max_chunk_size = platform::GpuMaxChunkSize();
return max_chunk_size == 0 ? 0 : available / max_chunk_size;
}
static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
};
class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
public:
CUDAPinnedChunkedAllocator()
: ChunkedAllocator(std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
platform::CUDAPinnedMaxChunkSize(), GetCapacity(),
-1) {} // never retry
private:
static size_t GetCapacity() {
size_t total = platform::CpuTotalPhysicalMemory();
size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
return max_chunk_size == 0 ? 0 : total / max_chunk_size;
}
};
#endif
class AllocatorFacadePrivate {
public:
std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() {
if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
InitLegacyAllocator();
} else {
InitCPUAllocator();
InitCUDAAllocator();
InitCUDAPinnedAllocator();
WrapZeroSizeAllocator();
}
}
private:
void InitLegacyAllocator() {
std::vector<platform::Place> places{platform::CPUPlace()};
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
places.emplace_back(platform::CUDAPlace(dev_id));
}
places.emplace_back(platform::CUDAPinnedPlace());
#endif
for (auto& p : places) {
allocators_[p] = std::make_shared<LegacyAllocator>(p);
}
}
void InitCPUAllocator() {
allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
}
void InitCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
int device_count = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<CUDAChunkedAllocator>(dev_id);
}
#endif
}
void InitCUDAPinnedAllocator() {
#ifdef PADDLE_WITH_CUDA
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CUDAPinnedChunkedAllocator>();
#endif
}
void WrapZeroSizeAllocator() {
for (auto& pair : allocators_) {
pair.second =
std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
}
}
};
// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
AllocatorFacade::~AllocatorFacade() { delete m_; }
AllocatorFacade& AllocatorFacade::Instance() {
static AllocatorFacade instance;
return instance;
}
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr) {
auto it = m_->allocators_.find(place);
if (it == m_->allocators_.end()) {
throw BadAlloc(
string::Sprintf("No such allocator for the place, %s", place));
}
return m_->allocators_.at(place)->Allocate(size, attr);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should
// be hidden behind this facade.
//
// NOTE(yy): This class is a singleton class.
// NOTE(yy): To create a stable ABI and make compilation faster, we use the
// Pimpl trick here.
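// Typical usage (a sketch based on the interface declared below):
//
//   auto buffer =
//       AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 1024);
//   auto shared =
//       AllocatorFacade::Instance().AllocShared(platform::CPUPlace(), 1024);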
class AllocatorFacadePrivate;
class AllocatorFacade {
public:
~AllocatorFacade();
AllocatorFacade(const AllocatorFacade& o) = delete;
const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
static AllocatorFacade& Instance();
// Allocate a shared allocation.
std::shared_ptr<Allocation> AllocShared(
const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
// Allocate a unique allocation.
AllocationPtr Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "gflags/gflags.h"
DEFINE_string(
    allocator_strategy, "legacy",
    "The allocation strategy. Legacy means the original allocator of Fluid. "
    "New means the experimental allocators of Fluid. In [legacy, new]");
namespace paddle {
namespace memory {
namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
return FLAGS_allocator_strategy == "legacy"
? AllocatorStrategy::kLegacy
: AllocatorStrategy::kNaiveBestFit;
}
AllocatorStrategy GetAllocatorStrategy() {
static AllocatorStrategy strategy = GetStrategyFromFlag();
return strategy;
}
void UseAllocatorStrategyGFlag() {}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace memory {
namespace allocation {
enum class AllocatorStrategy { kLegacy, kNaiveBestFit };
extern AllocatorStrategy GetAllocatorStrategy();
// Does nothing; it just makes sure the linker does not prune this file.
extern void UseAllocatorStrategyGFlag();
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
bool AutoIncrementAllocator::IsAllocThreadSafe() const { return true; }
std::shared_ptr<Allocator> AutoIncrementAllocator::CreateNewAllocator() {
std::lock_guard<std::mutex> guard(mtx_);
auto old_size = allocator_num_.load();
PADDLE_ENFORCE_LT(old_size, underlying_allocators_.size(),
"Allocator number exceeds capacity %d",
underlying_allocators_.size());
underlying_allocators_[old_size] = creator_();
prev_success_allocator_ = old_size;
++allocator_num_;
PADDLE_ENFORCE(
underlying_allocators_[old_size]->IsAllocThreadSafe(),
"the underlying allocator must be thread safe. This is a program "
"bug.");
return underlying_allocators_[old_size];
}
Allocation *AutoIncrementAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) {
auto cur = prev_success_allocator_.load();
size_t retry_count = allocator_num_.load();
size_t allocator_num = retry_count;
  while (retry_count-- > 0) {  // until the retry count is zero
try {
auto res = underlying_allocators_[cur]->Allocate(size, attr);
prev_success_allocator_ = cur;
return res.release();
} catch (BadAlloc &) {
if (++cur >= allocator_num) {
cur = 0;
}
} catch (...) {
      // if there is another type of exception, just rethrow it.
throw;
}
}
  // This happens when the first allocator is exhausted and
  // there is more than one allocation request.
  // In this situation, the first allocation request would succeed
  // and the second would fail if we did not use the allocator newly
  // created by the first allocation request.
for (cur = allocator_num; cur < allocator_num_; ++cur) {
try {
auto ret = underlying_allocators_[cur]->Allocate(size, attr);
prev_success_allocator_ = cur;
return ret.release();
} catch (BadAlloc &) {
} catch (...) {
throw;
}
}
// No suitable allocator
return CreateNewAllocator()->Allocate(size, attr).release();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic> // NOLINT
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// The AutoIncrementAllocator manages many underlying allocators. If none of
// them can allocate the requested memory, a new allocator is created and its
// `Allocate` method is invoked.
//
// NOTE(yy): The AutoIncrementAllocator prefers to allocate memory from the
// most recently successful allocator.
//
// NOTE(yy): We may need to release an underlying allocator if it allocates
// nothing. However, this is generally not useful, since it makes performance
// nondeterministic.
//
// NOTE(yy): This allocator is only locked when creating a new underlying
// allocator. Allocation requests from many threads may be dispatched to the
// same underlying allocator, so the underlying allocator must be thread
// safe.
//
// NOTE(zjl): A capacity parameter is added to the constructor because a
// high-performance thread-safe std::vector with varying size is hard to
// implement. Fortunately, we can get the total GPU memory and each chunk
// size, and therefore a suitable capacity for the AutoIncrementAllocator.
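// Example construction (a sketch; `CreateChunkAllocator` is a hypothetical
// factory returning std::shared_ptr<Allocator>):
//
//   AutoIncrementAllocator auto_inc(
//       [] { return CreateChunkAllocator(); }, /* capacity = */ 16);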
class AutoIncrementAllocator : public Allocator {
public:
  // Creator is the function used to create an underlying Allocator.
using AllocatorCreator = std::function<std::shared_ptr<Allocator>()>;
explicit AutoIncrementAllocator(AllocatorCreator&& creator, size_t capacity)
: creator_(std::move(creator)), underlying_allocators_(capacity) {}
bool IsAllocThreadSafe() const override;
private:
std::shared_ptr<Allocator> CreateNewAllocator();
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
AllocatorCreator creator_;
std::vector<AllocatorCreator::result_type> underlying_allocators_;
std::atomic<size_t> allocator_num_{0};
// Use std::atomic rather than std::mutex, since std::atomic is usually
// lock-free
std::atomic<size_t> prev_success_allocator_{0};
std::mutex mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <cmath>
#include <list>
#include <map>
#include <string>
namespace paddle {
namespace memory {
namespace allocation {
static int HighestBitPos(size_t N) {
if (UNLIKELY(N == 0)) {
return 0;
} else {
#ifdef __GNUC__
    return sizeof(unsigned long long) * 8 - __builtin_clzll(N);
#else
return static_cast<int>(std::log2(N) + 1);
#endif
}
}
BestFitAllocator::BestFitAllocator(Allocation* allocation)
: allocation_(allocation) {
details::Chunk chunk;
chunk.size_ = allocation_->size();
chunk.offset_ = 0;
chunk.is_free = true;
chunks_.emplace_back(chunk);
free_chunks_[HighestBitPos(chunk.size_)].insert(
{chunk.size_, chunks_.begin()});
}
size_t BestFitAllocator::FreeSize() const {
size_t acc = 0;
for (auto& array_item : free_chunks_) {
for (auto& pair : array_item) {
acc += pair.second->size_;
}
}
return acc;
}
BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
size_t free_chunk_offset,
MapIt bin_iterator) {
auto to_split_it = bin_iterator->second;
free_chunks_[free_chunk_offset].erase(bin_iterator);
PADDLE_ENFORCE(to_split_it->is_free);
PADDLE_ENFORCE_GE(to_split_it->size_, request_size);
auto remaining_size = to_split_it->size_ - request_size;
details::Chunk to_use;
details::Chunk remaining;
to_use.size_ = request_size;
to_use.is_free = false;
remaining.size_ = remaining_size;
remaining.is_free = true;
// calc offsets
to_use.offset_ = to_split_it->offset_;
remaining.offset_ = to_use.offset_ + to_use.size_;
// insert to chunk list
auto to_use_it = chunks_.insert(to_split_it, to_use);
if (remaining.size_ != 0) {
auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
free_chunks_[bit_size].insert(
{remaining.size_, chunks_.insert(to_split_it, remaining)});
}
chunks_.erase(to_split_it);
return to_use_it;
}
void BestFitAllocator::InsertFreeNode(const ListIt& it) {
auto pos = static_cast<size_t>(HighestBitPos(it->size_));
auto& free_map = free_chunks_[pos];
free_map.insert({it->size_, it});
}
void BestFitAllocator::EraseFreeNode(const ListIt& it) {
size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
auto& free_map = free_chunks_[pos];
auto map_it = free_map.find(it->size_);
  while (map_it != free_map.end() && map_it->second != it) {
++map_it;
}
PADDLE_ENFORCE(map_it != free_map.end());
free_map.erase(map_it);
}
size_t BestFitAllocator::NumFreeChunks() const {
size_t num = 0;
for (auto& array_item : free_chunks_) {
num += array_item.size();
}
return num;
}
void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
auto chunk_it = bf_allocation->ChunkIterator();
PADDLE_ENFORCE(!chunk_it->is_free);
chunk_it->is_free = true;
if (chunk_it != chunks_.begin()) {
auto prev_it = chunk_it;
--prev_it;
if (prev_it->is_free) {
// Merge Left.
EraseFreeNode(prev_it);
prev_it->size_ += chunk_it->size_;
chunks_.erase(chunk_it);
chunk_it = prev_it;
}
}
auto next_it = chunk_it;
++next_it;
if (next_it != chunks_.end() && next_it->is_free) {
EraseFreeNode(next_it);
chunk_it->size_ += next_it->size_;
chunks_.erase(next_it);
}
InsertFreeNode(chunk_it);
delete allocation;
}
Allocation* BestFitAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
MapIt map_it;
for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
map_it = free_chunks_[highest_set_bit].lower_bound(size);
if (map_it != free_chunks_[highest_set_bit].end()) {
break;
}
}
if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d, All fragments size is %d", size, FreeSize()));
}
auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
return new BestFitAllocation(this, chunk_it);
}
BestFitAllocation::BestFitAllocation(
paddle::memory::allocation::BestFitAllocator* allocator,
typename details::ChunkList::iterator chunk_it)
: Allocation(reinterpret_cast<void*>(
reinterpret_cast<uintptr_t>(allocator->BasePtr()) +
chunk_it->offset_),
chunk_it->size_, allocator->Place()),
chunk_it_(chunk_it) {}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <list>
#include <map>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
namespace details {
struct Chunk {
bool is_free{true};
// Offset to the base allocation.
uintptr_t offset_;
size_t size_;
};
// Here we use std::list to maintain the chunk list.
// NOTE(yy): The traditional implementation of a ChunkList adds `prev`/`next`
// pointers to `Chunk` and splits the allocation into a `ChunkHeader` and a
// `Payload`, e.g.,
// *-------*---------------*---------------*--------------*
// | Chunk | prev_ pointer | next_ pointer | payload .... |
// *-------*---------------*---------------*--------------*
// That implementation can just return a raw pointer, and the list structure
// can be recovered from the raw pointer. However, the same code cannot be
// used on the GPU, since the CPU cannot access GPU memory directly.
//
// So we choose to use `std::list` and return an allocation instance, which
// contains the list node iterator; this unifies the CPU/GPU code.
//
// Returning an allocation is not a bad idea, since Tensor/Vector should hold
// an allocation instead of a raw pointer directly.
using ChunkList = std::list<Chunk>;
// Here we use a multi-level map of free chunks.
// the map is
// MSB offset --> size --> [ChunkList::iterator]
//
// The time complexities:
// find a free chunk:
// O(logN),
// where N is the number of free nodes with the same MSB offset.
// find the position of a chunk iterator:
// O(logN + K),
//     where N is the number of free nodes with the same MSB offset
//     and K is the number of free nodes with the same size.
// insert a free chunk:
// O(logN),
// where N is the number of free nodes with the same MSB offset.
// erase a free chunk:
// O(1)
using FreeChunkBin =
std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
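// A worked example (illustrative numbers): a free chunk of 1000 bytes has
// HighestBitPos(1000) == 10, so it is stored as free_chunks_[10][1000]. A
// request for 600 bytes also has HighestBitPos(600) == 10, and
// lower_bound(600) in bin 10 finds the 1000-byte chunk, which is split into a
// 600-byte used chunk and a 400-byte free chunk that is re-binned under
// HighestBitPos(400) == 9.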
} // namespace details
class BestFitAllocator;
// The BestFitAllocation maintains the list node iterator of its chunk.
class BestFitAllocation : public Allocation {
private:
using ListIt = typename details::ChunkList::iterator;
public:
BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);
const ListIt& ChunkIterator() const { return chunk_it_; }
private:
typename details::ChunkList::iterator chunk_it_;
};
// TODO(yy): The current BestFitAllocator is not thread-safe. To make it
// thread-safe, we must wrap it with a LockedAllocator. However, a thread-safe
// allocator could be implemented by locking each bin and the chunk list
// independently, which would make BestFitAllocator faster in multi-threaded
// situations.
//
// This allocator implements a best-fit strategy that merges free nodes.
//
// To allocate a buffer, it finds the best-fit chunk. If the best-fit chunk is
// larger than the requested size, the original chunk is split in two: the
// first chunk is used and the second is put back into the free chunks.
//
// To free an allocation, it marks the allocation's chunk as free and merges
// it with the previous and next chunks when possible.
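// Example usage (a sketch; the backing allocation may come from any other
// allocator, here a CPUAllocator as in the unit tests):
//
//   CPUAllocator cpu;
//   auto chunk = cpu.Allocate(1 << 20, Allocator::kDefault);
//   BestFitAllocator best_fit(chunk.get());
//   auto a = best_fit.Allocate(256, Allocator::kDefault);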
class BestFitAllocator : public Allocator {
public:
explicit BestFitAllocator(Allocation* allocation);
void* BasePtr() const { return allocation_->ptr(); }
const platform::Place& Place() const { return allocation_->place(); }
size_t NumFreeChunks() const;
private:
size_t FreeSize() const;
using MapIt = typename details::FreeChunkBin::value_type::iterator;
using ListIt = typename details::ChunkList::iterator;
ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
MapIt bin_iterator);
void EraseFreeNode(const ListIt& it);
void InsertFreeNode(const ListIt& it);
protected:
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
Allocation* allocation_; // not owned
details::ChunkList chunks_;
details::FreeChunkBin free_chunks_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <random>
#include <thread>  // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class StubAllocation : public Allocation {
public:
explicit StubAllocation(size_t size)
      : Allocation(nullptr, size, platform::CPUPlace()) {}
};
TEST(BestFitAllocator, test_allocation) {
StubAllocation stub(4UL * 1024 * 1024 * 1024);
BestFitAllocator allocator(&stub);
{ auto allocation = allocator.Allocate(64, allocator.kDefault); }
{
auto allocation = allocator.Allocate(80, allocator.kDefault);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation.get());
ASSERT_NE(best_fit_allocation, nullptr);
ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free);
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
ASSERT_EQ(allocation->size(), 80);
ASSERT_EQ(allocation->ptr(), nullptr);
}
auto allocation2 = allocator.Allocate(60, allocator.kDefault);
auto allocation3 = allocator.Allocate(90, allocator.kDefault);
allocation2.reset();
allocation2 = allocator.Allocate(30, allocator.kDefault);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation2.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
}
allocation2.reset();
allocation2 = allocator.Allocate(60, allocator.kDefault);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation2.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
}
allocation.reset();
allocation2.reset();
allocation = allocator.Allocate(80 + 60, allocator.kDefault);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
}
allocation.reset();
allocation = allocator.Allocate(80, allocator.kDefault);
allocation2 = allocator.Allocate(60, allocator.kDefault);
allocation = nullptr;
allocation2 = nullptr;
allocation3 = nullptr;
ASSERT_EQ(allocator.NumFreeChunks(), 1U);
}
}
TEST(BestFitAllocator, test_concurrent_cpu_allocation) {
CPUAllocator allocator;
auto global_allocation =
allocator.Allocate(256UL * 1024 * 1024, allocator.kDefault);
std::unique_ptr<Allocator> best_fit_allocator(
new BestFitAllocator(global_allocation.get()));
LockedAllocator locked_allocator(std::move(best_fit_allocator));
auto th_main = [&] {
std::random_device dev;
std::default_random_engine engine(dev());
std::uniform_int_distribution<size_t> dist(1U, 1024U);
for (size_t i = 0; i < 128; ++i) {
size_t allocate_size = dist(engine);
auto allocation = locked_allocator.Allocate(
sizeof(size_t) * allocate_size, locked_allocator.kDefault);
size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
for (size_t j = 0; j < allocate_size; ++j) {
data[j] = j;
}
std::this_thread::yield();
for (size_t j = 0; j < allocate_size; ++j) {
ASSERT_EQ(data[j], j);
}
}
};
{
std::vector<std::thread> threads;
for (size_t i = 0; i < 1024; ++i) {
threads.emplace_back(th_main);
}
for (auto& th : threads) {
th.join();
}
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <random>
#include <thread>  // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace memory {
namespace allocation {
struct ForEachFill {
size_t* ptr_;
explicit ForEachFill(size_t* ptr) : ptr_(ptr) {}
__device__ void operator()(size_t i) { ptr_[i] = i; }
};
TEST(BestFitAllocator, concurrent_cuda) {
CUDAAllocator allocator(platform::CUDAPlace(0));
// 256 MB
auto cuda_allocation =
allocator.Allocate(256U * 1024 * 1024, allocator.kDefault);
LockedAllocator concurrent_allocator(
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
auto th_main = [&] {
std::random_device dev;
std::default_random_engine engine(dev());
std::uniform_int_distribution<size_t> dist(1U, 1024U);
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
std::array<size_t, 1024> buf;
for (size_t i = 0; i < 128; ++i) {
size_t allocate_size = dist(engine);
auto allocation = concurrent_allocator.Allocate(
sizeof(size_t) * allocate_size, concurrent_allocator.kDefault);
size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
ForEachFill fill(data);
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
allocate_size);
for_range(fill);
memory::Copy(platform::CPUPlace(), buf.data(), gpu, data,
sizeof(size_t) * allocate_size, dev_ctx.stream());
dev_ctx.Wait();
for (size_t j = 0; j < allocate_size; ++j) {
ASSERT_EQ(buf[j], j);
}
allocation = nullptr;
}
};
{
std::vector<std::thread> threads;
for (size_t i = 0; i < 1024; ++i) {
threads.emplace_back(th_main);
}
for (auto& th : threads) {
th.join();
}
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <algorithm>
#include <limits>
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
namespace paddle {
namespace memory {
namespace allocation {
BufferedAllocator::BufferedAllocator(std::unique_ptr<Allocator> &&allocator)
: underlying_allocator_(std::move(allocator)) {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_,
"Underlying allocator of BufferedAllocator must be unmanaged");
if (underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
}
BufferedAllocator::~BufferedAllocator() { FreeCache(-1UL); }
void BufferedAllocator::FreeCache(size_t size) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
if (UNLIKELY(size == 0)) return;
size_t cur = 0;
while (!allocations_.empty()) { // free the largest
auto it = --allocations_.end();
cur += it->second->size();
delete it->second.release();
allocations_.erase(it);
if (cur >= size) return;
}
}
bool BufferedAllocator::IsAllocThreadSafe() const {
return this->underlying_allocator_->IsAllocThreadSafe();
}
void BufferedAllocator::Free(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
allocations_.emplace(allocation->size(), AllocationPtr(allocation));
}
Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
auto it = allocations_.lower_bound(size);
if (it != allocations_.end() && it->first < size * 2) {
AllocationPtr result(std::move(it->second));
allocations_.erase(it);
return new AllocationWithUnderlying(std::move(result));
}
}
try {
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
} catch (BadAlloc &) {
FreeCache(size);
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
// NOTE(zjl): BufferedAllocator maintains a memory pool to accelerate memory
// allocation and to reuse memory.
// BufferedAllocator provides the same thread-safety level as
// `underlying_allocator_`.
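// Example usage (a sketch mirroring the unit tests; `chunk` is an
// Allocation* obtained from another allocator):
//
//   std::unique_ptr<Allocator> underlying(new BestFitAllocator(chunk));
//   BufferedAllocator buffered(std::move(underlying));
//   auto a = buffered.Allocate(1024, Allocator::kDefault);
//   a = nullptr;  // the freed block goes back to the cache, not to
//                 // the underlying allocator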
class BufferedAllocator : public Allocator {
public:
explicit BufferedAllocator(std::unique_ptr<Allocator> &&allocator);
~BufferedAllocator();
bool IsAllocThreadSafe() const override;
// only used in unittest
inline void ClearCache() { FreeCache(-1UL); }
private:
void FreeCache(size_t size);
protected:
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::unique_ptr<Allocator> underlying_allocator_;
std::multimap<size_t, AllocationPtr> allocations_;
std::unique_ptr<std::mutex> mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
inline std::unique_ptr<BufferedAllocator> GetBufferedAllocator(
Allocation *allocation, bool thread_safe) {
std::unique_ptr<Allocator> allocator(new BestFitAllocator(allocation));
if (thread_safe) {
allocator.reset(new LockedAllocator(std::move(allocator)));
}
return std::unique_ptr<BufferedAllocator>(
new BufferedAllocator(std::move(allocator)));
}
TEST(buffered_allocator, thread_safety) {
std::unique_ptr<CPUAllocator> allocator(new CPUAllocator());
auto chunk = allocator->Allocate(1 << 20, allocator->kDefault);
{
auto buf_allocator = GetBufferedAllocator(chunk.get(), true);
ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), true);
}
{
auto buf_allocator = GetBufferedAllocator(chunk.get(), false);
ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), false);
}
}
class StubAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class StubAllocator : public Allocator {
public:
void ResetCounter() {
construct_count_ = 0;
destruct_count_ = 0;
}
size_t GetAllocCount() const { return construct_count_; }
size_t GetFreeCount() const { return destruct_count_; }
protected:
void Free(Allocation *allocation) override {
auto *alloc = dynamic_cast<StubAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(alloc);
if (alloc->ptr()) delete[] static_cast<uint8_t *>(alloc->ptr());
++destruct_count_;
delete allocation;
}
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override {
++construct_count_;
if (size == 0) {
return new StubAllocation(nullptr, 0, platform::CPUPlace());
} else {
return new StubAllocation(new uint8_t[size], size, platform::CPUPlace());
}
}
private:
size_t construct_count_ = 0;
size_t destruct_count_ = 0;
};
constexpr size_t kZero = 0;
constexpr size_t kOne = 1;
constexpr size_t kTwo = 2;
TEST(buffered_allocator, lazy_free) {
std::unique_ptr<StubAllocator> stub_allocator(new StubAllocator());
auto *underlying_allocator = stub_allocator.get();
std::unique_ptr<BufferedAllocator> allocator(
new BufferedAllocator(std::move(stub_allocator)));
{
underlying_allocator->ResetCounter();
auto x = allocator->Allocate(1025, allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
x = nullptr;
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
{
underlying_allocator->ResetCounter();
auto x = allocator->Allocate(900, allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
auto y = allocator->Allocate(2048, allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
x = nullptr;
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
y = nullptr;
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
{
underlying_allocator->ResetCounter();
allocator->ClearCache();
ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
}
}
TEST(buffered_allocator, garbage_collection) {
std::unique_ptr<CPUAllocator> cpu_allocator(new CPUAllocator());
auto chunk = cpu_allocator->Allocate(2048, cpu_allocator->kDefault);
auto allocator = GetBufferedAllocator(chunk.get(), false);
auto x1 = allocator->Allocate(1600, allocator->kDefault);
auto x2 = allocator->Allocate(400, allocator->kDefault);
x1 = nullptr;
x2 = nullptr;
auto x3 = allocator->Allocate(1600, allocator->kDefault);
ASSERT_NE(x3, nullptr);
ASSERT_NE(x3->ptr(), nullptr);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
ConditionalAllocator& ConditionalAllocator::AddAllocator(
std::function<bool(size_t, Allocator::Attr)> func,
std::shared_ptr<Allocator> allocator) {
underlying_allocators_.emplace_back(std::move(func), std::move(allocator));
return *this;
}
bool ConditionalAllocator::IsAllocThreadSafe() const {
return std::all_of(underlying_allocators_.begin(),
underlying_allocators_.end(),
[](const AllocatorWithCond& allocatorWithCond) {
return allocatorWithCond.second->IsAllocThreadSafe();
});
}
Allocation* ConditionalAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) {
for (auto& pair : underlying_allocators_) {
if (pair.first(size, attr)) {
return pair.second->Allocate(size, attr).release();
}
}
throw BadAlloc("No suitable allocator");
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// A composite allocator that dispatches the allocation request according to
// the registered conditions.
//
// For example:
//
// auto* cond_allocator = new ConditionalAllocator();
// cond_allocator->AddAllocator([](size_t size, Attr attr){
// // if size > 10
// return size > 10;
// }, allocator_a).AddAllocator([](size_t size, Attr attr){
// // elif attr is kDefault
// return attr == kDefault;
// }, allocator_b).AddAllocator([](size_t size, Attr attr){
// // else
// return true;
// }, allocator_c);
class ConditionalAllocator : public Allocator {
public:
ConditionalAllocator() = default;
ConditionalAllocator& AddAllocator(std::function<bool(size_t, Attr)> func,
std::shared_ptr<Allocator> allocator);
bool IsAllocThreadSafe() const override;
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
using AllocatorWithCond =
std::pair<std::function<bool(size_t, Attr)>, std::shared_ptr<Allocator>>;
std::vector<AllocatorWithCond> underlying_allocators_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include <stdlib.h>
#include <string>
namespace paddle {
namespace memory {
namespace allocation {
CPUAllocation::CPUAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
bool CPUAllocator::IsAllocThreadSafe() const { return true; }
void CPUAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation *>(allocation));
free(allocation->ptr());
delete allocation;
}
Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr;
auto status = posix_memalign(&ptr, kAlignment, size);
  if (UNLIKELY(status != 0)) {
throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
size, status));
}
return new CPUAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// CPU system allocator and allocation.
//
// NOTE(yy): Should we just use `malloc` here, since there is already an
// aligned_allocator?
//
// NOTE(yy): There is no need to use `BestFitAllocator` on CPU. We can import
// an open-sourced allocator into Paddle.
class CPUAllocator;
class CPUAllocation : public Allocation {
public:
CPUAllocation(void* ptr, size_t size);
};
class CPUAllocator : public Allocator {
public:
constexpr static size_t kAlignment = 64u;
bool IsAllocThreadSafe() const override;
protected:
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
};
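// A minimal usage sketch (illustrative only; assumes this header is included
// and the allocation library is linked):
//
//   CPUAllocator allocator;
//   auto allocation = allocator.Allocate(1 << 20);  // 1 MB, 64-byte aligned
//   void* buf = allocation->ptr();
//   // Memory is released via Free() when the returned allocation is destroyed.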
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::Free(Allocation* allocation) {
platform::CUDADeviceGuard guard(place_.device);
auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
place_);
PADDLE_ENFORCE(cudaFree(allocation->ptr()));
delete allocation;
}
Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::CUDADeviceGuard guard(place_.device);
void* ptr;
auto status = cudaMalloc(&ptr, size);
if (UNLIKELY(status != cudaSuccess)) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status)));
}
return new CUDAAllocation(ptr, size, platform::Place(place_));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
// CUDA System allocator and allocation.
// Just a flag type.
class CUDAAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class CUDAAllocator : public Allocator {
public:
explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
explicit CUDAAllocator(const platform::Place& place)
: place_(boost::get<platform::CUDAPlace>(place)) {}
bool IsAllocThreadSafe() const override;
protected:
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
platform::CUDAPlace place_;
};
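// A minimal usage sketch (illustrative only; requires a CUDA build):
//
//   CUDAAllocator allocator(platform::CUDAPlace(0));
//   auto allocation = allocator.Allocate(1 << 20);  // 1 MB on GPU 0
//   void* device_ptr = allocation->ptr();
//   // cudaFree is invoked via Free() when the allocation is destroyed.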
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
namespace memory {
namespace legacy {
template <typename Place>
void *Alloc(const Place &place, size_t size);
template <typename Place>
void Free(const Place &place, void *p);
template <typename Place>
size_t Used(const Place &place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace &cpu) const;
size_t operator()(const platform::CUDAPlace &gpu) const;
size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const;
};
size_t memory_usage(const platform::Place &p);
using BuddyAllocator = detail::BuddyAllocator;
BuddyAllocator *GetCPUBuddyAllocator() {
  // We tried thread_local for the inference::RNN1 model, but it does not help
  // much in the multi-thread test.
static std::once_flag init_flag;
static detail::BuddyAllocator *a = nullptr;
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
});
return a;
}
// We compared NaiveAllocator with BuddyAllocator for CPU memory allocation;
// they seem to have almost the same overhead.
struct NaiveAllocator {
void *Alloc(size_t size) { return malloc(size); }
void Free(void *p) {
PADDLE_ENFORCE(p);
free(p);
}
static NaiveAllocator *Instance() {
static NaiveAllocator x;
return &x;
}
private:
std::mutex lock_;
};
template <>
void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(100) << " pointer=" << p;
return p;
}
template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
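// A hedged usage sketch of the legacy CPU interface above (illustrative only;
// the byte count is arbitrary):
//
//   platform::CPUPlace cpu;
//   void* p = Alloc(cpu, 1024);  // served by the CPU buddy allocator
//   size_t used = Used(cpu);     // bytes currently in use for this place
//   Free(cpu, p);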
#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
gpu_num);
a_arr = new BuddyAllocator *[gpu_num];
for (int i = 0; i < gpu_num; i++) {
a_arr[i] = nullptr;
platform::SetDeviceId(i);
a_arr[i] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(100) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
platform::SetDeviceId(gpu_id);
return a_arr[gpu_id];
}
#endif
template <>
size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA
return GetGPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
template <>
void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
<< " in GPU " << place.device << ", available "
<< string::HumanReadableSize(avail);
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMinChunkSize());
LOG(WARNING) << "GpuMaxChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMaxChunkSize());
LOG(WARNING) << "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) {
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
return ba;
}
#endif
template <>
size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA
return GetCUDAPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
template <>
void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p) {
#ifdef PADDLE_WITH_CUDA
GetCUDAPinnedBuddyAllocator()->Free(p);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {}
template <typename Place>
inline void *operator()(const Place &place) const {
return Alloc<Place>(place, size_);
}
private:
size_t size_;
};
struct FreeVisitor : public boost::static_visitor<void> {
inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {}
template <typename Place>
inline void operator()(const Place &place) const {
Free<Place>(place, ptr_);
}
private:
void *ptr_;
};
size_t Usage::operator()(const platform::CPUPlace &cpu) const {
return Used(cpu);
}
size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
#ifdef PADDLE_WITH_CUDA
return Used(gpu);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
} // namespace legacy
namespace allocation {
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_);
}
void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()),
allocation->place());
delete allocation;
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class LegacyAllocatorPrivate;
class LegacyAllocator : public Allocator {
public:
explicit LegacyAllocator(const platform::Place &p) : place_(p) {}
protected:
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
void Free(Allocation *allocation) override;
private:
platform::Place place_;
};
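// A hedged usage sketch (illustrative only): LegacyAllocator forwards every
// request to the legacy Alloc/Free visitors for the configured place.
//
//   platform::CPUPlace cpu;
//   LegacyAllocator allocator(cpu);
//   auto allocation = allocator.Allocate(4096);
//   // Freed through the legacy interface when the allocation is destroyed.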
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
bool LockedAllocator::IsAllocThreadSafe() const { return true; }
LockedAllocator::LockedAllocator(
std::unique_ptr<Allocator> &&underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {
PADDLE_ENFORCE_NOT_NULL(underlying_allocator_);
if (!underlying_allocator_->IsAllocThreadSafe()) {
mtx_.reset(new std::mutex());
}
}
void LockedAllocator::Free(Allocation *allocation) {
{
platform::LockGuardPtr<std::mutex> guard(mtx_);
reinterpret_cast<AllocationWithUnderlying *>(allocation)
->allocation_.reset(); // Destroy inner allocation
}
delete allocation;
}
Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
platform::LockGuardPtr<std::mutex> guard(mtx_);
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// An allocator that makes the underlying allocator thread safe.
class LockedAllocator : public Allocator {
public:
explicit LockedAllocator(std::unique_ptr<Allocator> &&underlying_allocator);
bool IsAllocThreadSafe() const override;
protected:
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::unique_ptr<Allocator> underlying_allocator_;
std::unique_ptr<std::mutex> mtx_;
};
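// A hedged usage sketch (illustrative only), wrapping a non-thread-safe
// allocator so that it can be shared between threads. `chunk` stands for an
// existing Allocation* that provides the managed memory; it is an assumption
// of this sketch, not part of this header.
//
//   std::unique_ptr<Allocator> best_fit(new BestFitAllocator(chunk));
//   LockedAllocator locked(std::move(best_fit));
//   auto allocation = locked.Allocate(256);  // serialized by an internal mutex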
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include <cuda.h>
#include <cuda_runtime.h>
namespace paddle {
namespace memory {
namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::Free(Allocation *allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation *>(allocation));
PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
delete allocation;
}
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
Allocator::Attr attr) {
// PADDLE_ENFORCE_EQ(
// attr, kCrossDevice,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void *ptr;
PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
return new CPUPinnedAllocation(ptr, size);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// Allocator uses `cudaMallocHost`
class CPUPinnedAllocation : public Allocation {
public:
CPUPinnedAllocation(void *ptr, size_t size)
: Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
};
class CPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override;
protected:
void Free(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override;
};
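// A minimal usage sketch (illustrative only; requires a CUDA build since the
// memory comes from cudaMallocHost):
//
//   CPUPinnedAllocator allocator;
//   auto allocation = allocator.Allocate(1 << 20);  // 1 MB of pinned host memory
//   // cudaFreeHost is invoked via Free() when the allocation is destroyed.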
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
namespace paddle {
namespace memory {
namespace allocation {
bool RetryAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void RetryAllocator::Free(Allocation* allocation) {
// Delete underlying allocation first.
reinterpret_cast<AllocationWithUnderlying*>(allocation)->allocation_.reset();
{
    // Notify all waiting threads so they can retry allocation after this free.
std::lock_guard<std::mutex> lock(mutex_);
cv_.notify_all();
}
delete allocation;
}
Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
auto alloc_func = [&, this]() {
return new AllocationWithUnderlying(
underlying_allocator_->Allocate(size, attr));
};
  // In fact, we could unify the code paths for allocation success and failure,
  // but that would take the lock even when the first allocation attempt succeeds.
try {
return alloc_func();
} catch (BadAlloc& bad_alloc) {
{
      // We could put the allocation retry inside the predicate function of
      // wait_until, but the predicate is executed while the lock is held.
      // For better performance, we use an explicit loop here.
auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
auto wait_until = [&, this] {
std::unique_lock<std::mutex> lock(mutex_);
return cv_.wait_until(lock, end_time);
};
while (wait_until() != std::cv_status::timeout) {
try {
return alloc_func();
} catch (BadAlloc& ex) {
bad_alloc = ex;
} catch (...) {
throw;
}
}
      throw;  // Timed out: rethrow the BadAlloc, updated with the latest retry failure.
}
} catch (...) {
throw;
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class RetryAllocator;
class RetryAllocator : public Allocator {
public:
RetryAllocator(std::unique_ptr<Allocator>&& allocator, size_t retry_ms)
: underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) {
EnforceCheck();
}
bool IsAllocThreadSafe() const override;
private:
void EnforceCheck() {
PADDLE_ENFORCE_NOT_NULL(
underlying_allocator_.get(),
"UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator");
PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(),
"UnderlyingAllocator of RetryAllocator must be thread-safe");
}
protected:
void Free(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::unique_ptr<Allocator> underlying_allocator_;
std::chrono::milliseconds retry_time_;
std::mutex mutex_;
std::condition_variable cv_;
  // For debugging, we could add an atomic counter to record how many bytes
  // are waiting to be allocated, e.g.
// std::atomic<size_t> waited_allocate_size_{0};
friend class RetryAllocation;
};
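// A hedged usage sketch (illustrative only), mirroring retry_allocator_test:
// wrap a thread-safe allocator and retry a failed allocation for up to 500 ms,
// waking up whenever another thread frees memory. `inner` and `requested_size`
// are placeholders for this sketch.
//
//   std::unique_ptr<Allocator> locked(new LockedAllocator(std::move(inner)));
//   RetryAllocator retry(std::move(locked), /*retry_ms=*/500);
//   auto allocation = retry.Allocate(requested_size);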
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
TEST(RetryAllocator, RetryAllocator) {
CPUAllocator cpu_allocator;
size_t size = (1 << 20);
auto cpu_allocation = cpu_allocator.Allocate(size, cpu_allocator.kDefault);
std::unique_ptr<BestFitAllocator> best_fit_allocator(
new BestFitAllocator(cpu_allocation.get()));
std::unique_ptr<LockedAllocator> locked_allocator(
new LockedAllocator(std::move(best_fit_allocator)));
size_t thread_num = 32;
size_t sleep_time = 40;
size_t extra_time = 2;
  // Reserved so that more allocators can be added and tested in the future.
std::vector<std::shared_ptr<Allocator>> allocators;
{
std::unique_ptr<BestFitAllocator> best_fit_allocator(
new BestFitAllocator(cpu_allocation.get()));
std::unique_ptr<LockedAllocator> locked_allocator(
new LockedAllocator(std::move(best_fit_allocator)));
allocators.push_back(std::make_shared<RetryAllocator>(
std::move(locked_allocator),
(thread_num - 1) * (sleep_time + extra_time)));
}
for (auto &allocator : allocators) {
std::vector<std::thread> threads(thread_num);
std::vector<void *> addresses(threads.size(), nullptr);
std::mutex mutex;
std::condition_variable cv;
bool flag = false;
for (size_t i = 0; i < threads.size(); ++i) {
threads[i] = std::thread([&, i]() {
{
std::unique_lock<std::mutex> lock(mutex);
cv.wait(lock, [&] { return flag; });
}
auto ret = allocator->Allocate(size - 1);
addresses[i] = ret->ptr();
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time));
});
}
{
std::lock_guard<std::mutex> lock(mutex);
flag = true;
cv.notify_all();
}
for (auto &th : threads) {
th.join();
}
void *val = cpu_allocation->ptr();
bool is_all_equal = std::all_of(addresses.begin(), addresses.end(),
[val](void *p) { return p == val; });
ASSERT_TRUE(is_all_equal);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
bool ZeroSizeAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new ZeroSizeAllocation(place_);
} else {
return underlying_allocator_->Allocate(size, attr).release();
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// The allocator that handles zero-size requests. It always returns an
// allocation even when the requested size is zero; however, the returned
// allocation's ptr() is nullptr.
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
const platform::Place& p)
: underlying_allocator_(std::move(underlying_allocator)), place_(p) {}
bool IsAllocThreadSafe() const override;
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
const platform::Place& place_;
};
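// A hedged usage sketch (illustrative only); `underlying` and `place` stand
// for an existing allocator and its platform::Place:
//
//   ZeroSizeAllocator allocator(underlying, place);
//   auto a0 = allocator.Allocate(0);    // a0->ptr() == nullptr, no real allocation
//   auto a1 = allocator.Allocate(128);  // forwarded to `underlying`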
} // namespace allocation
} // namespace memory
} // namespace paddle
...@@ -30,12 +30,7 @@ limitations under the License. */ ...@@ -30,12 +30,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which DECLARE_bool(use_pinned_memory);
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
......
...@@ -12,221 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,221 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/platform/place.h"
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
using BuddyAllocator = detail::BuddyAllocator; size_t size, Allocator::Attr attr) {
return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr);
BuddyAllocator* GetCPUBuddyAllocator() {
// We tried thread_local for inference::RNN1 model, but that not works much
// for multi-thread test.
static std::once_flag init_flag;
static detail::BuddyAllocator* a = nullptr;
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
});
return a;
}
// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
// seems they are almost the same overhead.
struct NaiveAllocator {
void* Alloc(size_t size) { return malloc(size); }
void Free(void* p) {
PADDLE_ENFORCE(p);
free(p);
}
static NaiveAllocator* Instance() {
static NaiveAllocator x;
return &x;
}
private:
std::mutex lock_;
};
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(100) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(100) << " pointer=" << p;
return p;
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(100) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator** a_arr = nullptr;
std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
gpu_num);
a_arr = new BuddyAllocator*[gpu_num];
for (int i = 0; i < gpu_num; i++) {
a_arr[i] = nullptr;
platform::SetDeviceId(i);
a_arr[i] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(100) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
platform::SetDeviceId(gpu_id);
return a_arr[gpu_id];
}
template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size)
<< " in GPU " << place.device << ", available "
<< string::HumanReadableSize(avail);
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMinChunkSize());
LOG(WARNING) << "GpuMaxChunkSize "
<< string::HumanReadableSize(
buddy_allocator->GetMaxChunkSize());
LOG(WARNING) << "GPU memory used: "
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator* ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
return ba;
}
template <>
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
return GetCUDAPinnedBuddyAllocator()->Used();
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
size_t size) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
}
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
GetCUDAPinnedBuddyAllocator()->Free(p);
}
#endif
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
return Used(cpu);
}
size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
return Used(gpu);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
} }
size_t memory_usage(const platform::Place& p) { AllocationPtr Alloc(const platform::Place& place, size_t size,
return boost::apply_visitor(Usage(), p); Allocator::Attr attr) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
} }
} // namespace memory } // namespace memory
......
...@@ -14,91 +14,21 @@ limitations under the License. */ ...@@ -14,91 +14,21 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
using allocation::Allocation;
using allocation::Allocator;
using allocation::AllocationPtr;
/** extern std::shared_ptr<Allocation> AllocShared(
* \brief Allocate memory block in one place. const platform::Place& place, size_t size,
* Allocator::Attr attr = Allocator::kDefault);
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private: extern AllocationPtr Alloc(const platform::Place& place, size_t size,
Place place_; Allocator::Attr attr = Allocator::kDefault);
};
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
}
size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
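// For illustration only (the real sizes are platform dependent): if
// CpuMinChunkSize() were 4096 bytes and MemoryBlock::Desc were 64 bytes, a
// 4000-byte request would grow to 4064 bytes and then round up to 4096.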
TEST(BuddyAllocator, CPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CPUPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CPUMultAlloc) {
paddle::platform::CPUPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, GPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = gpu;
EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
paddle::memory::Free(gpu, p);
}
TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::CUDAPlace gpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(gpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(size, gpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(gpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(p.second, gpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
}
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, CUDAPinnedAllocator) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPinnedPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
paddle::platform::CUDAPinnedPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#endif
...@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
template <> template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>( void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
...@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else { } else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
cudaStreamSynchronize(0);
}
} }
} }
...@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else { } else {
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
cudaStreamSynchronize(0);
}
} }
} }
......
...@@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
register_operators(EXCLUDES warpctc_op) register_operators(EXCLUDES warpctc_op conv_fusion_op)
# warpctc_cudnn need cudnn 7 above # warpctc_cudnn need cudnn 7 above
if (WITH_GPU) if (WITH_GPU)
...@@ -43,6 +43,8 @@ if (WITH_GPU) ...@@ -43,6 +43,8 @@ if (WITH_GPU)
else() else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
else() else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
...@@ -70,7 +72,7 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) ...@@ -70,7 +72,7 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
......
...@@ -54,7 +54,8 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) { ...@@ -54,7 +54,8 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) {
} }
} }
TEST(beam_search_op, run) { // It seems that beam_search_op has bugs.
TEST(DISABLED_beam_search_op, run) {
CPUPlace place; CPUPlace place;
LoDTensor ids, scores; LoDTensor ids, scores;
CreateInput(&ids, &scores); CreateInput(&ids, &scores);
......
...@@ -43,26 +43,6 @@ using DataLayout = platform::DataLayout; ...@@ -43,26 +43,6 @@ using DataLayout = platform::DataLayout;
template <typename T> template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType; using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
#if CUDNN_VERSION_MIN(6, 0, 5)
static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#else
// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
template <typename T> template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> { class CUDNNConvOpKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -17,10 +17,31 @@ limitations under the License. */ ...@@ -17,10 +17,31 @@ limitations under the License. */
#include <functional> #include <functional>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
#if CUDNN_VERSION_MIN(6, 0, 5)
static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#else
// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
template <typename TAlgorithm> template <typename TAlgorithm>
class AlgorithmsCache { class AlgorithmsCache {
public: public:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
// This fused conv follows the equation:
//   y = act(alpha1 * conv(x) + alpha2 * z + bias),
// where y is Output, x is Input, z is ResidualData, and bias is Bias.
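// For example (illustrative only), with activation == "relu", alpha1 == 1 and
// alpha2 == 0 this reduces to the common y = relu(conv(x) + bias) case.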
class Conv2DFusionOpMaker : public Conv2DOpMaker {
protected:
void Apply() override {
AddAttr<std::string>(
"activation",
"The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
"'relux' , 'tanh', 'band_pass'")
.SetDefault("relu");
}
};
// TODO(qingqing): add gradient operator for conv2d_fusion
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker,
ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
using DataLayout = platform::DataLayout;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
template <typename T>
class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* bias = ctx.Input<Tensor>("Bias");
PADDLE_ENFORCE(bias, "The bias should not be null.");
auto* residual = ctx.Input<Tensor>("ResidualData");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string activation = ctx.Attr<std::string>("activation");
int groups = ctx.Attr<int>("groups");
int64_t user_workspace_size =
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* bias_data = bias->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
const T* residual_data = residual ? residual->data<T>() : output_data;
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedTensorDescriptor bias_desc;
ScopedConvolutionDescriptor conv_desc;
ScopedActivationDescriptor act_desc;
DataLayout layout = DataLayout::kNCHW;
if (input->dims().size() == 5) {
layout = DataLayout::kNCDHW;
}
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
cudnn_conv_desc, groups));
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
// Currently only the NCHW layout is supported.
std::vector<int> bias_dim = {1, static_cast<int>(output->dims()[1]), 1, 1};
cudnnTensorDescriptor_t cudnn_bias_desc =
bias_desc.descriptor<T>(layout, bias_dim);
cudnnActivationDescriptor_t cudnn_act_desc =
act_desc.descriptor<T>(activation);
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
int64_t max_user_size =
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
user_workspace_size);
workspace_size_limit = max_user_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// supported with CUDNN_ACTIVATION_IDENTITY in the cuDNN library.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
} else {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
algo_cache =
ctx.scope()
.FindVar(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
} else {
algo_cache =
const_cast<framework::Scope&>(ctx.scope())
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = fwd_perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return fwd_perf_stat[0].algo;
});
VLOG(3) << "choose algo " << algo;
}
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
ops::CUDNNConvFusionOpKernel<double>);
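The exhaustive-search branch above memoizes the winning cudnnConvolutionFwdAlgo_t in a scope variable, so cudnnFindConvolutionForwardAlgorithmEx runs at most once per distinct (input shape, filter shape, stride, padding, dilation) configuration. A minimal sketch of that memoization pattern follows; the names are hypothetical stand-ins, not Paddle's actual AlgorithmsCache interface.

#include <functional>
#include <map>
#include <string>

// Hypothetical stand-in for the algorithm cache: the expensive search functor
// only runs on a cache miss; later calls with the same key reuse the result.
template <typename TAlgorithm>
class SimpleAlgoCache {
 public:
  TAlgorithm Get(const std::string& key,
                 const std::function<TAlgorithm()>& search) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;  // hit: skip the cuDNN search
    TAlgorithm algo = search();                 // miss: run Find*AlgorithmEx once
    cache_.emplace(key, algo);
    return algo;
  }

 private:
  std::map<std::string, TAlgorithm> cache_;
};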
...@@ -12,11 +12,11 @@ ...@@ -12,11 +12,11 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/framework/data_layout_transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -428,8 +428,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -428,8 +428,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
"same dimension sizes"); "same dimension sizes");
if (residual_param->format() != handler.GetDstFormat()) { if (residual_param->format() != handler.GetDstFormat()) {
auto output_data = auto output_data = output->mutable_data<T>(
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize()); ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler.GetDstMemorySize());
auto residual_data_tz = auto residual_data_tz =
paddle::framework::vectorize2int(residual_param->dims()); paddle::framework::vectorize2int(residual_param->dims());
auto residual_data_type = auto residual_data_type =
...@@ -449,8 +450,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -449,8 +450,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data)); handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
} }
} else { } else {
auto output_data = auto output_data = output->mutable_data<T>(
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize()); ctx.GetPlace(), paddle::memory::Allocator::kDefault,
handler.GetDstMemorySize());
dst_memory_p = dst_memory_p =
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data)); handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
} }
...@@ -692,7 +694,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -692,7 +694,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
user_diff_dst_memory_p, pipeline); user_diff_dst_memory_p, pipeline);
const size_t size = handler.GetDiffWeightsMemorySize(); const size_t size = handler.GetDiffWeightsMemorySize();
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size); filter_grad_data = filter_grad->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
auto diff_weights_memory_p = auto diff_weights_memory_p =
handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
...@@ -717,7 +720,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -717,7 +720,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pipeline); pipeline);
const size_t size = handler.GetDiffSourceMemorySize(); const size_t size = handler.GetDiffSourceMemorySize();
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size); input_grad_data = input_grad->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
reinterpret_cast<void*>(input_grad_data)); reinterpret_cast<void*>(input_grad_data));
......
...@@ -225,17 +225,9 @@ $$ ...@@ -225,17 +225,9 @@ $$
W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$ $$
)DOC"); )DOC");
Apply();
} }
class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{
{"Input", /*->*/ "Output"}};
}
};
void Conv3DOpMaker::Make() { void Conv3DOpMaker::Make() {
AddInput( AddInput(
"Input", "Input",
...@@ -334,6 +326,7 @@ Example: ...@@ -334,6 +326,7 @@ Example:
W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
$$ $$
)DOC"); )DOC");
Apply();
} }
void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -60,12 +61,27 @@ inline bool IsExpand(const std::vector<int64_t>& filter_dim, ...@@ -60,12 +61,27 @@ inline bool IsExpand(const std::vector<int64_t>& filter_dim,
// operator implementations can reuse the code. // operator implementations can reuse the code.
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override; void Make() final;
protected:
virtual void Apply() {}
}; };
class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override; void Make() final;
protected:
virtual void Apply() {}
};
class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{
{"Input", /*->*/ "Output"}};
}
}; };
class ConvOp : public framework::OperatorWithKernel { class ConvOp : public framework::OperatorWithKernel {
......
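The header change above turns Make() into a template method: it is now final and ends by calling a protected virtual Apply() hook, which is how Conv2DFusionOpMaker adds its extra "activation" attribute without redefining the shared conv proto. A small self-contained sketch of the pattern, with illustrative types rather than the real OpProtoAndCheckerMaker:

#include <iostream>

// Illustrative template-method sketch: Make() builds the common attributes and
// then calls the Apply() hook that derived makers override.
class BaseConvMaker {
 public:
  void Make() {  // declared `final` in the real conv_op.h
    std::cout << "add common conv inputs/attrs\n";
    Apply();
  }
  virtual ~BaseConvMaker() = default;

 protected:
  virtual void Apply() {}  // default hook: nothing extra
};

class FusionConvMaker : public BaseConvMaker {
 protected:
  void Apply() override { std::cout << "add 'activation' attr\n"; }
};

int main() {
  FusionConvMaker maker;
  maker.Make();  // prints the common attrs line, then the fusion-only attr line
}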
...@@ -30,27 +30,30 @@ class BoxCoderOp : public framework::OperatorWithKernel { ...@@ -30,27 +30,30 @@ class BoxCoderOp : public framework::OperatorWithKernel {
auto prior_box_dims = ctx->GetInputDim("PriorBox"); auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto target_box_dims = ctx->GetInputDim("TargetBox"); auto target_box_dims = ctx->GetInputDim("TargetBox");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, if (ctx->IsRuntime()) {
"The rank of Input of PriorBoxVar must be 2"); PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); "The rank of Input of PriorBoxVar must be 2");
if (ctx->HasInput("PriorBoxVar")) { PADDLE_ENFORCE_EQ(prior_box_dims[1], 4,
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); if (ctx->HasInput("PriorBoxVar")) {
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
}
auto code_type =
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input of TargetBox must be 3");
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
}
} }
auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input of TargetBox must be 3");
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
}
ctx->SetOutputDim( ctx->SetOutputDim(
"OutputBox", "OutputBox",
framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include <stdio.h> #include <stdio.h>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -67,17 +68,15 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, ...@@ -67,17 +68,15 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>( cub::DeviceRadixSort::SortPairsDescending<T, int>(
nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
// Allocate temporary storage // Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
void *d_temp_storage = memory::Alloc(place, temp_storage_bytes); auto d_temp_storage =
memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad);
// Run sorting operation // Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>( cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
num); idx_out, num);
memory::Free(place, d_temp_storage);
} }
template <typename T> template <typename T>
......
...@@ -36,24 +36,26 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { ...@@ -36,24 +36,26 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
auto box_dims = ctx->GetInputDim("BBoxes"); auto box_dims = ctx->GetInputDim("BBoxes");
auto score_dims = ctx->GetInputDim("Scores"); auto score_dims = ctx->GetInputDim("Scores");
PADDLE_ENFORCE_EQ(box_dims.size(), 3, if (ctx->IsRuntime()) {
"The rank of Input(BBoxes) must be 3."); PADDLE_ENFORCE_EQ(box_dims.size(), 3,
PADDLE_ENFORCE_EQ(score_dims.size(), 3, "The rank of Input(BBoxes) must be 3.");
"The rank of Input(Scores) must be 3."); PADDLE_ENFORCE_EQ(score_dims.size(), 3,
PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || "The rank of Input(Scores) must be 3.");
box_dims[2] == 24 || box_dims[2] == 32, PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 ||
"The 2nd dimension of Input(BBoxes) must be 4 or 8, " box_dims[2] == 16 || box_dims[2] == 24 ||
"represents the layout of coordinate " box_dims[2] == 32,
"[xmin, ymin, xmax, ymax] or " "The 2nd dimension of Input(BBoxes) must be 4 or 8, "
"4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " "represents the layout of coordinate "
"8 points: [xi, yi] i= 1,2,...,8 or " "[xmin, ymin, xmax, ymax] or "
"12 points: [xi, yi] i= 1,2,...,12 or " "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
"16 points: [xi, yi] i= 1,2,...,16"); "8 points: [xi, yi] i= 1,2,...,8 or "
PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], "12 points: [xi, yi] i= 1,2,...,12 or "
"The 1st dimensiong of Input(BBoxes) must be equal to " "16 points: [xi, yi] i= 1,2,...,16");
"3rd dimension of Input(Scores), which represents the " PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
"predicted bboxes."); "The 1st dimensiong of Input(BBoxes) must be equal to "
"3rd dimension of Input(Scores), which represents the "
"predicted bboxes.");
}
// Here the box_dims[0] is not the real dimension of output. // Here the box_dims[0] is not the real dimension of output.
// It will be rewritten in the computing kernel. // It will be rewritten in the computing kernel.
ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
......
...@@ -32,17 +32,20 @@ namespace paddle { ...@@ -32,17 +32,20 @@ namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
static void SerializeDestroyCallback(void* payload) {
if (payload != nullptr) {
auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
delete shared_payload;
}
}
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg, const std::string& out_name, ::grpc::ByteBuffer* msg, const std::string& out_name,
const int trainer_id) { const int trainer_id) {
platform::RecordRPCEvent record_event("serial", &ctx); platform::RecordRPCEvent record_event("serial", &ctx);
// Default DestroyCallback does nothing. When using GPU,
// the CPU buffer needs to be freed.
DestroyCallback destroy_callback = [](void* backing) {};
VarMsg request; VarMsg request;
void* payload = nullptr; TensorPayload* payload = nullptr;
size_t payload_size;
request.set_varname(name); request.set_varname(name);
request.set_trainer_id(trainer_id); request.set_trainer_id(trainer_id);
...@@ -62,10 +65,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -62,10 +65,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} }
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
request.set_type(::sendrecv::LOD_TENSOR); request.set_type(::sendrecv::LOD_TENSOR);
GetTensorPayload(var, ctx, &request, &payload, &payload_size); payload = new TensorPayload(GetTensorPayload(var, ctx, &request));
} else if (var->IsType<framework::SelectedRows>()) { } else if (var->IsType<framework::SelectedRows>()) {
request.set_type(::sendrecv::SELECTED_ROWS); request.set_type(::sendrecv::SELECTED_ROWS);
GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
} else if (var->IsType<ncclUniqueId>()) { } else if (var->IsType<ncclUniqueId>()) {
request.set_type(::sendrecv::NCCL_ID); request.set_type(::sendrecv::NCCL_ID);
...@@ -75,17 +78,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -75,17 +78,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
typeid(var->Type()).name()); typeid(var->Type()).name());
} }
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
// GPU data is copied to CPU buffer when sending,
// free the buffer when possible.
destroy_callback = [](void* backing) {
platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cuda_pinned, backing);
};
#endif
}
std::string header; std::string header;
request.AppendToString(&header); request.AppendToString(&header);
auto buffer = std::unique_ptr<char[]>(new char[1024]); auto buffer = std::unique_ptr<char[]>(new char[1024]);
...@@ -109,16 +101,18 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -109,16 +101,18 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
return; return;
} }
#endif #endif
PADDLE_ENFORCE_NOT_NULL(payload);
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size());
// steal reference of tensor data // steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer int num_slices = 2; // only SelectedRows have rows buffer
slices[0] = ::grpc::Slice(e.size()); slices[0] = ::grpc::Slice(e.size());
memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size()); memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
slices[1] = ::grpc::Slice( slices[1] = ::grpc::Slice(
grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(),
static_cast<char*>(payload)), SerializeDestroyCallback, payload),
::grpc::Slice::STEAL_REF); ::grpc::Slice::STEAL_REF);
if (var->IsType<framework::SelectedRows>()) { if (var->IsType<framework::SelectedRows>()) {
......
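The slice above hands the serialized tensor buffer to gRPC without a copy: grpc_slice_new_with_user_data stores the raw pointer plus a destroy callback, and SerializeDestroyCallback deletes the heap-allocated TensorPayload (releasing its shared allocation) only once gRPC is finished with the slice. Below is a gRPC-free sketch of that ownership-handoff pattern; every name in it is hypothetical.

#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical consumer with a C-style (ptr, len, destroy, user_data) API,
// standing in for grpc_slice_new_with_user_data.
void ConsumeBuffer(const void* data, std::size_t len, void (*destroy)(void*),
                   void* user_data) {
  (void)data;
  std::printf("sending %zu bytes\n", len);
  destroy(user_data);  // invoked once the consumer no longer needs the bytes
}

struct PayloadHolder {                        // stands in for TensorPayload
  std::shared_ptr<std::vector<char>> buffer;  // stands in for memory::Allocation
};

static void DestroyPayload(void* p) {
  delete static_cast<PayloadHolder*>(p);  // drops the shared_ptr; buffer freed
}

int main() {
  auto* holder = new PayloadHolder{std::make_shared<std::vector<char>>(64)};
  ConsumeBuffer(holder->buffer->data(), holder->buffer->size(), DestroyPayload,
                holder);
}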
...@@ -28,16 +28,34 @@ namespace distributed { ...@@ -28,16 +28,34 @@ namespace distributed {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
static TensorPayload GetCommunicationAllocationFromTensor(
const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
if (is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void* GetVarPayLoad(const std::string varname, int64_t size) { PADDLE_ENFORCE(is_gpu_place(tensor.place()));
platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx =
return memory::Alloc(cuda_pinned, size); reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
} auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
#endif platform::CUDAPinnedPlace cuda_pinned;
auto result = memory::AllocShared(
cuda_pinned, copy_size, memory::allocation::Allocator::kCrossDevice);
void GetTensorPayload(framework::Variable* var, memory::Copy(cuda_pinned, result->ptr(),
const platform::DeviceContext& ctx, VarMsg* request, boost::get<platform::CUDAPlace>(tensor.place()),
void** payload, size_t* payload_size) { tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
ctx.Wait();
return TensorPayload(result);
#else
PADDLE_THROW("This situation should not be happened");
#endif
} else {
return TensorPayload(tensor);
}
}
TensorPayload GetTensorPayload(framework::Variable* var,
const platform::DeviceContext& ctx,
VarMsg* request) {
auto tensor = var->Get<framework::LoDTensor>(); auto tensor = var->Get<framework::LoDTensor>();
// FIXME(wuyi): data types in send_recv.proto is copied from // FIXME(wuyi): data types in send_recv.proto is copied from
// framework.proto // framework.proto
...@@ -56,31 +74,12 @@ void GetTensorPayload(framework::Variable* var, ...@@ -56,31 +74,12 @@ void GetTensorPayload(framework::Variable* var,
} }
} }
} }
if (platform::is_gpu_place(ctx.GetPlace())) { return GetCommunicationAllocationFromTensor(ctx, tensor);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
// platform::CUDAPinnedPlace cuda_pinned;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
*payload = GetVarPayLoad(request->varname(), copy_size);
platform::CUDAPinnedPlace cuda_pinned;
memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
*payload = tensor.data<void>();
}
*payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
} }
void GetSelectedRowsPayload(framework::Variable* var, TensorPayload GetSelectedRowsPayload(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* request, const platform::DeviceContext& ctx,
void** payload, size_t* payload_size) { VarMsg* request) {
auto* slr = var->GetMutable<framework::SelectedRows>(); auto* slr = var->GetMutable<framework::SelectedRows>();
request->set_data_type( request->set_data_type(
static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type()))); static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type())));
...@@ -92,25 +91,20 @@ void GetSelectedRowsPayload(framework::Variable* var, ...@@ -92,25 +91,20 @@ void GetSelectedRowsPayload(framework::Variable* var,
} }
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) { return GetCommunicationAllocationFromTensor(ctx, *tensor);
#ifdef PADDLE_WITH_CUDA
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
*payload = GetVarPayLoad(request->varname(), copy_size);
platform::CUDAPinnedPlace cuda_pinned;
memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
*payload = slr->mutable_value()->data<void>();
}
*payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
} }
TensorPayload::TensorPayload(std::shared_ptr<memory::Allocation> allocation)
: allocation_(allocation), offset_(0), memory_size_(allocation->size()) {}
TensorPayload::TensorPayload(const framework::Tensor& tensor)
: allocation_(tensor.Holder()),
offset_(tensor.offset()),
memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {}
void* TensorPayload::ptr() const {
return reinterpret_cast<void*>(
reinterpret_cast<uintptr_t>(allocation_->ptr()) + offset_);
}
size_t TensorPayload::memory_size() const { return memory_size_; }
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -33,13 +33,30 @@ namespace distributed { ...@@ -33,13 +33,30 @@ namespace distributed {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
void GetTensorPayload(framework::Variable* var, class TensorPayload final {
const platform::DeviceContext& ctx, VarMsg* request, public:
void** payload, size_t* payload_size); explicit TensorPayload(const framework::Tensor& tensor);
explicit TensorPayload(std::shared_ptr<memory::Allocation> allocation);
void GetSelectedRowsPayload(framework::Variable* var, TensorPayload(const TensorPayload& o) = default;
const platform::DeviceContext& ctx, VarMsg* request, TensorPayload& operator=(const TensorPayload& o) = default;
void** payload, size_t* payload_size);
void* ptr() const;
size_t memory_size() const;
private:
std::shared_ptr<memory::Allocation> allocation_;
size_t offset_;
size_t memory_size_;
};
TensorPayload GetTensorPayload(framework::Variable* var,
const platform::DeviceContext& ctx,
VarMsg* request);
TensorPayload GetSelectedRowsPayload(framework::Variable* var,
const platform::DeviceContext& ctx,
VarMsg* request);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) { switch (type) {
......
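For readers of ptr() above: TensorPayload never copies the tensor; it only keeps the shared allocation, the tensor's byte offset into it, and a size, so the pointer arithmetic is just base plus offset. A stripped-down sketch with illustrative names:

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

// Minimal stand-in for TensorPayload: a shared buffer, a byte offset, a size.
struct MiniPayload {
  std::shared_ptr<std::vector<char>> allocation;
  std::size_t offset;
  std::size_t memory_size;

  void* ptr() const {
    return reinterpret_cast<void*>(
        reinterpret_cast<std::uintptr_t>(allocation->data()) + offset);
  }
};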
...@@ -115,11 +115,11 @@ bool VariableResponse::CopyLodTensorData( ...@@ -115,11 +115,11 @@ bool VariableResponse::CopyLodTensorData(
void* tensor_data = void* tensor_data =
tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false;
}
return true; VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
<< ", Buffer Size = " << length;
PADDLE_ENFORCE_EQ(tensor->memory_size(), length);
return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
} }
inline framework::DDim GetDims( inline framework::DDim GetDims(
......
...@@ -17,6 +17,10 @@ limitations under the License. */ ...@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__)
#include "paddle/fluid/operators/math/jit_kernel.h"
#endif
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -191,6 +195,8 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -191,6 +195,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
out.ShareDataWith(*y); out.ShareDataWith(*y);
out.Resize(matrix_shape); out.Resize(matrix_shape);
#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
defined(__OSX__)
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context()); RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
...@@ -217,6 +223,19 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -217,6 +223,19 @@ class LayerNormKernel : public framework::OpKernel<T> {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>( ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out); ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
} }
#else
PADDLE_ENFORCE_EQ(mean->numel(), left);
PADDLE_ENFORCE_EQ(var->numel(), left);
PADDLE_ENFORCE_EQ(scale->numel(), right);
PADDLE_ENFORCE_EQ(bias->numel(), right);
const auto& ker = math::jitkernel::KernelPool::Instance()
.template Get<math::jitkernel::LayerNormKernel<T>>(
static_cast<int>(right));
ker->Compute(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
scale->data<T>(), bias->data<T>(), static_cast<int>(left),
static_cast<const float>(epsilon));
#endif
} }
}; };
......
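Both branches above compute the same thing. Written out, the per-row layer normalization over the last dimension (width W = right, with `left` rows indexed by i) is

$$
\mu_i = \frac{1}{W}\sum_{j=1}^{W} x_{ij}, \qquad
\sigma_i^2 = \frac{1}{W}\sum_{j=1}^{W}\left(x_{ij}-\mu_i\right)^2, \qquad
y_{ij} = \frac{x_{ij}-\mu_i}{\sqrt{\sigma_i^2+\epsilon}}\,\gamma_j + \beta_j
$$

where gamma is Scale and beta is Bias; this matches the scalar reference implementation in jit_kernel_layer_norm.cc further down in this diff.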
...@@ -72,12 +72,12 @@ cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_paddin ...@@ -72,12 +72,12 @@ cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_paddin
cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
if(WITH_GPU) if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if (NOT WIN32) if (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK) if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
......
...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h" #include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,257 +59,83 @@ void VXXJitCode::generate() { ...@@ -60,257 +59,83 @@ void VXXJitCode::generate() {
offset += sizeof(float) * YMM_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
} }
int rest = num_ % YMM_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { while (rest > 0) {
if (scalar_index_ != 1) { int block = XMM_FLOAT_BLOCK;
vmovups(xmm_src1, ptr[param1 + offset]); if (rest >= 4) {
} block = 4;
if (scalar_index_ != 2) { if (scalar_index_ != 1) {
vmovups(xmm_src2, ptr[param2 + offset]); vmovups(xmm_src1, ptr[param1 + offset]);
} }
if (type_ == operand_type::mul) { if (scalar_index_ != 2) {
vmulps(xmm_dst, xmm_src1, xmm_src2); vmovups(xmm_src2, ptr[param2 + offset]);
} else if (type_ == operand_type::add) { }
vaddps(xmm_dst, xmm_src1, xmm_src2); } else if (rest >= 2) {
} block = 2;
if (with_relu_) { if (scalar_index_ != 1) {
vmaxps(xmm_dst, xmm_zero, xmm_dst); vmovq(xmm_src1, ptr[param1 + offset]);
} }
vmovups(ptr[param3 + offset], xmm_dst); if (scalar_index_ != 2) {
offset += sizeof(float) * 4; vmovq(xmm_src2, ptr[param2 + offset]);
rest -= 4; }
} } else {
if (rest >= 2) { block = 1;
if (scalar_index_ != 1) { if (scalar_index_ != 1) {
vmovups(xmm_src1, ptr[param1 + offset]); vmovss(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovss(xmm_src2, ptr[param2 + offset]);
}
} }
if (scalar_index_ != 2) { switch (type_) {
vmovups(xmm_src2, ptr[param2 + offset]); case operand_type::mul:
} vmulps(xmm_dst, xmm_src1, xmm_src2);
if (type_ == operand_type::mul) { break;
vmulps(xmm_dst, xmm_src1, xmm_src2); case operand_type::add:
} else if (type_ == operand_type::add) { vaddps(xmm_dst, xmm_src1, xmm_src2);
vaddps(xmm_dst, xmm_src1, xmm_src2); break;
default:
break;
} }
if (with_relu_) { if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst); vmaxps(xmm_dst, xmm_zero, xmm_dst);
} }
vmovq(ptr[param3 + offset], xmm_dst); if (rest >= 4) {
offset += sizeof(float) * 2; vmovups(ptr[param3 + offset], xmm_dst);
rest -= 2; } else if (rest >= 2) {
} vmovq(ptr[param3 + offset], xmm_dst);
if (rest > 0) { } else {
if (scalar_index_ != 1) { vmovss(ptr[param3 + offset], xmm_dst);
vmovups(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovups(xmm_src2, ptr[param2 + offset]);
}
if (type_ == operand_type::mul) {
vmulss(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddss(xmm_dst, xmm_src1, xmm_src2);
} }
if (with_relu_) { offset += sizeof(float) * block;
vmaxps(xmm_dst, xmm_zero, xmm_dst); rest -= block;
}
vmovss(ptr[param3 + offset], xmm_dst);
} }
ret(); ret();
} }
#define ALIGN32 __attribute__((aligned(32))) const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
#define EXP_HIG 88.3762626647949f REPEAT_8TIMES(2.f),
#define EXP_LOW -88.3762626647949f REPEAT_8TIMES(0.5f),
#define CEPHES_LOG2EF 1.44269504088896341 REPEAT_8TIMES(EXP_HIG),
#define CEPHES_EXP_C1 0.693359375 REPEAT_8TIMES(EXP_LOW),
#define CEPHES_EXP_C2 -2.12194440e-4 REPEAT_8TIMES(CEPHES_LOG2EF),
#define CEPHES_EXP_P0 1.9875691500E-4 REPEAT_8TIMES(CEPHES_EXP_C1),
#define CEPHES_EXP_P1 1.3981999507E-3 REPEAT_8TIMES(CEPHES_EXP_C2),
#define CEPHES_EXP_P2 8.3334519073E-3 REPEAT_8TIMES(CEPHES_EXP_P0),
#define CEPHES_EXP_P3 4.1665795894E-2 REPEAT_8TIMES(CEPHES_EXP_P1),
#define CEPHES_EXP_P4 1.6666665459E-1 REPEAT_8TIMES(CEPHES_EXP_P2),
#define CEPHES_EXP_P5 5.0000001201E-1 REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) int g_tmp_mem[16] ALIGN32 = {0};
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static const float exp_float_consts[] ALIGN32 = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
static int g_tmp_mem[16] ALIGN32 = {0};
bool VActJitCode::init(int d, operand_type type) { bool VActJitCode::init(int d, operand_type type) {
bool ok = MayIUse(avx); // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
if (type == operand_type::relu) { return MayIUse(avx);
return ok;
} else if (type == operand_type::exp) {
// exp is slower than mkl when d >= 256
return ok && d % 8 == 0 && d < 256;
} else {
// TODO(TJ): support more
return ok && d % 8 == 0;
}
}
void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) {
vmaxps(ymm_dst, ymm_zero, ymm_src);
}
void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enforce
// check that all indices are distinct
ymm_t ymm_fx = ymm_t(fx_idx);
ymm_t ymm_fy = ymm_t(fy_idx);
ymm_t ymm_mask = ymm_t(mask_idx);
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(ymm_fx, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(ymm_fx, ymm_fx, ymm_tmp);
vroundps(ymm_fy, ymm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(ymm_mask, ymm_fy, ymm_fx);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vandps(ymm_mask, ymm_mask, ymm_tmp);
vsubps(ymm_fx, ymm_fy, ymm_mask);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(ymm_fy, ymm_fx, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
ymm_t ymm_z = ymm_t(ymm_mask.getIdx());
vmulps(ymm_z, ymm_fx, ymm_tmp);
vsubps(ymm_src, ymm_src, ymm_fy);
vsubps(ymm_src, ymm_src, ymm_z);
vmulps(ymm_z, ymm_src, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(ymm_dst, ymm_src, ymm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_src);
}
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_z);
vaddps(ymm_dst, ymm_dst, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
// build 2^n
ymm_t ymm_int = ymm_fx;
vcvttps2dq(ymm_int, ymm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(ymm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2)) {
vpaddd(ymm_int, ymm_int, ymm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128 bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]);
vmovdqa(xtmp2,
ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(ymm_dst, ymm_dst, ymm_int);
pop(reg_ptr_global);
}
void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 1 / (1 + e^-x)
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
vxorps(ymm_tmp, ymm_tmp, ymm_tmp);
vsubps(ymm_src, ymm_tmp, ymm_src);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
pop(reg_ptr_global);
}
void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 2 / (1 + e^(-2x)) - 1
ymm_t ymm_tmp = ymm_t(tmp_idx);
ymm_t ymm_zero = ymm_t(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(ymm_zero, ymm_zero, ymm_zero);
vsubps(ymm_tmp, ymm_zero, ymm_tmp);
vmulps(ymm_src, ymm_src, ymm_tmp);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(ymm_dst, ymm_dst, ymm_tmp);
pop(reg_ptr_global);
} }
void VActJitCode::generate() { void VActJitCode::generate() {
...@@ -324,16 +149,16 @@ void VActJitCode::generate() { ...@@ -324,16 +149,16 @@ void VActJitCode::generate() {
vmovups(ymm_src, ptr[param1 + offset]); vmovups(ymm_src, ptr[param1 + offset]);
switch (type_) { switch (type_) {
case operand_type::relu: case operand_type::relu:
relu_ymm(ymm_dst, ymm_src, ymm_zero); relu_jmm<ymm_t>(ymm_dst, ymm_src, ymm_zero);
break; break;
case operand_type::exp: case operand_type::exp:
exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); exp_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::sigmoid: case operand_type::sigmoid:
sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); sigmoid_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::tanh: case operand_type::tanh:
tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); tanh_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::identity: case operand_type::identity:
break; break;
...@@ -343,30 +168,44 @@ void VActJitCode::generate() { ...@@ -343,30 +168,44 @@ void VActJitCode::generate() {
vmovups(ptr[param2 + offset], ymm_dst); vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
} }
if (type_ != operand_type::relu) {
// TODO(TJ): remove me
ret();
return;
}
int rest = num_ % YMM_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { while (rest > 0) {
vmovups(xmm_src, ptr[param1 + offset]); int block = XMM_FLOAT_BLOCK;
vmaxps(xmm_dst, xmm_zero, xmm_src); if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst); block = 4;
offset += sizeof(float) * 4; vmovups(xmm_src, ptr[param1 + offset]);
rest -= 4; } else if (rest >= 2) {
} block = 2;
if (rest >= 2) { vmovq(xmm_src, ptr[param1 + offset]);
vmovups(xmm_src, ptr[param1 + offset]); } else {
vmaxps(xmm_dst, xmm_zero, xmm_src); block = 1;
vmovq(ptr[param2 + offset], xmm_dst); vmovss(xmm_src, ptr[param1 + offset]);
offset += sizeof(float) * 2; }
rest -= 2; switch (type_) {
} case operand_type::relu:
if (rest > 0) { relu_jmm<xmm_t>(xmm_dst, xmm_src, xmm_zero);
vmovups(xmm_src, ptr[param1 + offset]); break;
vmaxps(xmm_dst, xmm_zero, xmm_src); case operand_type::exp:
vmovss(ptr[param2 + offset], xmm_dst); exp_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::sigmoid:
sigmoid_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::tanh:
tanh_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
default:
break;
}
if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param2 + offset], xmm_dst);
} else {
vmovss(ptr[param2 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
} }
ret(); ret();
} }
......
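The main change in jit_code.cc above is collapsing the three copies of the tail handling into one loop: the remainder left after the 8-float (YMM) main loop is processed in the widest block the XMM moves allow, 4 floats (vmovups), then 2 (vmovq), then 1 (vmovss). A plain scalar restatement of that control flow, function name illustrative:

#include <algorithm>

// Scalar sketch of the tail loop VActJitCode generates for relu; the JIT emits
// the same structure with vector loads/stores instead of the inner for-loop.
void ReluTail(const float* x, float* y, int n) {
  int rest = n % 8;        // elements the 8-wide main loop did not cover
  int offset = n - rest;
  while (rest > 0) {
    const int block = (rest >= 4) ? 4 : (rest >= 2) ? 2 : 1;
    for (int i = 0; i < block; ++i) {  // vmovups / vmovq / vmovss equivalents
      y[offset + i] = std::max(x[offset + i], 0.0f);
    }
    offset += block;
    rest -= block;
  }
}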
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/math/jit_gen.h" #include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -40,6 +42,51 @@ typedef enum { ...@@ -40,6 +42,51 @@ typedef enum {
identity identity
} operand_type; } operand_type;
extern const float exp_float_consts[];
extern const int exp_int_0x7f[];
extern int g_tmp_mem[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class VXXJitCode : public JitCode { class VXXJitCode : public JitCode {
public: public:
...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode { ...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode {
void generate() override; void generate() override;
protected: protected:
// compute relu with ymm // compute relu with ymm, xmm
void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, template <typename JMM>
const Xbyak::Ymm& zero); void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT
vmaxps(dst, src, zero);
}
// compute exp with ymm // compute exp with ymm, xmm
void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
using namespace platform::jit; // NOLINT
assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enforce
// check that all indices are distinct
JMM jmm_fx = JMM(fx_idx);
JMM jmm_fy = JMM(fy_idx);
JMM jmm_mask = JMM(mask_idx);
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(src, src, jmm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(jmm_fx, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(jmm_fx, jmm_fx, jmm_tmp);
vroundps(jmm_fy, jmm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(jmm_mask, jmm_fy, jmm_fx);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vandps(jmm_mask, jmm_mask, jmm_tmp);
vsubps(jmm_fx, jmm_fy, jmm_mask);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(jmm_fy, jmm_fx, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
JMM ymm_z = JMM(jmm_mask.getIdx());
vmulps(ymm_z, jmm_fx, jmm_tmp);
vsubps(src, src, jmm_fy);
vsubps(src, src, ymm_z);
vmulps(ymm_z, src, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(dst, src, jmm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, src);
}
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, ymm_z);
vaddps(dst, dst, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vaddps(dst, dst, jmm_tmp);
// build 2^n
JMM ymm_int = jmm_fx;
vcvttps2dq(ymm_int, jmm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
vpaddd(ymm_int, ymm_int, jmm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128 bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]);
vmovdqa(xtmp2, ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(dst, dst, ymm_int);
pop(reg_ptr_global);
}
// compute sigmoid with ymm // compute sigmoid with ymm, xmm
void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) {
// y = 1 / (1 + e^-x)
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(src, src, jmm_tmp);
vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
vsubps(src, jmm_tmp, src);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vdivps(dst, jmm_tmp, dst);
pop(reg_ptr_global);
}
// compute tanh with ymm // compute tanh with ymm, xmm
void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
// y = 2 / (1 + e^(-2x)) - 1
JMM jmm_tmp = JMM(tmp_idx);
JMM jmm_zero = JMM(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(jmm_zero, jmm_zero, jmm_zero);
vsubps(jmm_tmp, jmm_zero, jmm_tmp);
vmulps(src, src, jmm_tmp);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(dst, jmm_tmp, dst);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(dst, dst, jmm_tmp);
pop(reg_ptr_global);
}
protected: protected:
int num_; int num_;
......
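For reference, the constants above encode the usual Cephes-style range reduction that exp_jmm implements: clamp x to [EXP_LOW, EXP_HIG], split off a power of two, and evaluate a degree-5 polynomial on the reduced argument,

$$
e^{x} = 2^{n}\, e^{g}, \qquad
n = \left\lfloor x\log_2 e + \tfrac{1}{2}\right\rfloor, \qquad
g = x - n\,C_1 - n\,C_2
$$

where C_1 + C_2 approximates ln 2 (split for extra precision), and

$$
e^{g} \approx 1 + g + g^{2}\Big(P_5 + g\big(P_4 + g(P_3 + g(P_2 + g(P_1 + g\,P_0)))\big)\Big).
$$

The factor 2^n is then built in the integer domain by adding 0x7f to n and shifting it into the float exponent field, which is what the vpaddd/vpslld-by-23 sequence does.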
...@@ -26,6 +26,7 @@ namespace operators { ...@@ -26,6 +26,7 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
...@@ -145,6 +146,14 @@ class CRFDecodeKernel : public Kernel { ...@@ -145,6 +146,14 @@ class CRFDecodeKernel : public Kernel {
int *track) const = 0; int *track) const = 0;
}; };
template <typename T>
class LayerNormKernel : public Kernel {
public:
virtual void Compute(T *x, T *out, T *mean, T *var, const T *scale,
const T *bias, int height,
const float epsilon) const = 0;
};
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <math.h>
#include <limits>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
/* Layer Norm JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class LayerNormKernelImpl : public LayerNormKernel<T> {
public:
explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
this->num_ = right;
}
void Compute(T* x, T* out, T* mean, T* var, const T* scale, const T* bias,
int height, const float epsilon) const override {
// get mean
for (int i = 0; i < height; i++) {
T sum = 0.0;
int offset = i * this->num_;
for (int j = 0; j < this->num_; j++) {
sum += x[offset + j];
}
mean[i] = sum / this->num_;
}
// get variance
for (int i = 0; i < height; i++) {
T sum = 0.0;
int offset = i * this->num_;
for (int j = 0; j < this->num_; j++) {
sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]);
}
var[i] = sum / this->num_;
}
for (int i = 0; i < height; i++) {
int offset = i * this->num_;
T sqrt_var = sqrt(var[i] + (T)epsilon);
for (int j = 0; j < this->num_; j++) {
out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var;
}
}
if (scale) {
for (int i = 0; i < height; i++) {
int offset = i * this->num_;
for (int j = 0; j < this->num_; j++) {
out[offset + j] *= scale[j];
}
}
}
if (bias) {
for (int i = 0; i < height; i++) {
int offset = i * this->num_;
for (int j = 0; j < this->num_; j++) {
out[offset + j] += bias[j];
}
}
}
}
};
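For row i of the n = `num_` columns handled by the scalar reference path above, the kernel computes the standard layer normalization (scale γ and bias β are applied only when the corresponding pointers are non-null):

```latex
\mu_i = \frac{1}{n}\sum_{j=1}^{n} x_{ij}, \qquad
\sigma_i^2 = \frac{1}{n}\sum_{j=1}^{n}\bigl(x_{ij}-\mu_i\bigr)^2, \qquad
y_{ij} = \gamma_j\,\frac{x_{ij}-\mu_i}{\sqrt{\sigma_i^2+\epsilon}} + \beta_j
```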
#define INTRIAVX_FLOAT(isa, block) \
template <> \
LayerNormKernelImpl<float, isa, block>::LayerNormKernelImpl(int right) \
: LayerNormKernel<float>() { \
this->num_ = right; \
this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
this->end_ = this->num_ - this->rest_; \
} \
template <> \
void LayerNormKernelImpl<float, jit::avx, block>::Compute( \
float* x, float* out, float* mean, float* var, const float* scale, \
const float* bias, int height, const float epsilon) const { \
__m256 sum; \
__m256 mean_vec, var_vec; \
__m128 hi, lo; \
__m256 tmp; \
size_t offset; \
size_t j; \
__m256 reverse_num_vec = \
_mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_)); \
__m256 epsilon_vec = _mm256_set1_ps(epsilon); \
int rest_mask = \
((-1) & (~((~0U) >> (sizeof(int) * 8 - (YMM_FLOAT_BLOCK - rest_))))) & \
0x0ff; \
__m256i mask_vec = _mm256_set_epi32( \
rest_mask & 0x80 ? 0xffffffff : 0, rest_mask & 0x40 ? 0xffffffff : 0, \
rest_mask & 0x20 ? 0xffffffff : 0, rest_mask & 0x10 ? 0xffffffff : 0, \
rest_mask & 0x8 ? 0xffffffff : 0, rest_mask & 0x4 ? 0xffffffff : 0, \
rest_mask & 0x2 ? 0xffffffff : 0, rest_mask & 0x1 ? 0xffffffff : 0); \
\
for (int i = 0; i < height; ++i) { \
offset = i * this->num_; \
\
/* get mean */ \
sum = _mm256_setzero_ps(); \
for (j = offset; j < end_ + offset; j += block) { \
sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j)); \
} \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
tmp = _mm256_loadu_ps((const float*)x + j); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \
sum = _mm256_add_ps(sum, tmp); \
} \
hi = _mm256_extractf128_ps(sum, 1); \
lo = _mm256_extractf128_ps(sum, 0); \
sum = _mm256_add_ps( \
sum, _mm256_insertf128_ps( \
_mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); \
sum = _mm256_hadd_ps(sum, sum); \
sum = _mm256_hadd_ps(sum, sum); \
mean_vec = _mm256_mul_ps(sum, reverse_num_vec); \
mean[i] = *reinterpret_cast<float*>(&mean_vec); \
\
/* get variance */ \
sum = _mm256_setzero_ps(); \
for (j = offset; j < end_ + offset; j += block) { \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_mul_ps(tmp, tmp); \
sum = _mm256_add_ps(sum, tmp); \
} \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_mul_ps(tmp, tmp); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec); \
sum = _mm256_add_ps(sum, tmp); \
} \
hi = _mm256_extractf128_ps(sum, 1); \
lo = _mm256_extractf128_ps(sum, 0); \
sum = _mm256_add_ps( \
sum, _mm256_insertf128_ps( \
_mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); \
sum = _mm256_hadd_ps(sum, sum); \
sum = _mm256_hadd_ps(sum, sum); \
var_vec = _mm256_mul_ps(sum, reverse_num_vec); \
var[i] = *reinterpret_cast<float*>(&var_vec); \
\
/* get x_norm and calculate output */                                       \
for (j = offset; j < end_ + offset; j += block) { \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_div_ps( \
tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); \
_mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp); \
} \
if (rest_ != 0) { \
j = offset + num_ - block; \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_div_ps( \
tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); \
_mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp); \
} \
\
if (scale) { \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
tmp = _mm256_loadu_ps((const float*)out + j); \
} \
for (j = offset; j < end_ + offset; j += block) { \
_mm256_storeu_ps( \
reinterpret_cast<float*>(out) + j, \
_mm256_mul_ps( \
_mm256_loadu_ps((const float*)out + j), \
_mm256_loadu_ps((const float*)scale + j - offset))); \
} \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
_mm256_storeu_ps( \
reinterpret_cast<float*>(out) + j, \
_mm256_mul_ps( \
tmp, _mm256_loadu_ps((const float*)scale + j - offset))); \
} \
} \
\
if (bias) { \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
tmp = _mm256_loadu_ps((const float*)out + j); \
} \
for (j = offset; j < end_ + offset; j += block) { \
_mm256_storeu_ps( \
reinterpret_cast<float*>(out) + j, \
_mm256_add_ps( \
_mm256_loadu_ps((const float*)out + j), \
_mm256_loadu_ps((const float*)bias + j - offset))); \
} \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
_mm256_storeu_ps( \
reinterpret_cast<float*>(out) + j, \
_mm256_add_ps( \
tmp, _mm256_loadu_ps((const float*)bias + j - offset))); \
} \
} \
} \
}
#ifdef __AVX__
INTRIAVX_FLOAT(jit::avx, kEQ8);
INTRIAVX_FLOAT(jit::avx, kGT8LT16);
INTRIAVX_FLOAT(jit::avx, kEQ16);
INTRIAVX_FLOAT(jit::avx, kGT16);
#endif
#ifdef __AVX2__
INTRIAVX_FLOAT(jit::avx2, kEQ8);
INTRIAVX_FLOAT(jit::avx2, kGT8LT16);
INTRIAVX_FLOAT(jit::avx2, kEQ16);
INTRIAVX_FLOAT(jit::avx2, kGT16);
#endif
#undef INTRIAVX_FLOAT
REGISTER_JITKERNEL_DEPRECATED(layer_norm, LayerNormKernel);
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
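The trickiest part of the AVX macro above is the tail handling: the last, partially filled block is re-read from `offset + num_ - block` and a blend mask keeps only the lanes that the full-block loop has not already covered. Below is a minimal standalone sketch of that `rest_mask` computation, assuming `YMM_FLOAT_BLOCK` is 8 (eight packed floats per 256-bit register); it is illustrative only and not part of the diff.

```cpp
#include <cstdio>

int main() {
  const int kBlock = 8;  // assumed value of YMM_FLOAT_BLOCK
  for (int rest = 1; rest < kBlock; ++rest) {
    // Same expression as rest_mask in INTRIAVX_FLOAT, with rest_ -> rest.
    unsigned rest_mask =
        ((-1) & (~((~0U) >> (sizeof(int) * 8 - (kBlock - rest))))) & 0x0ffu;
    std::printf("rest=%d mask=0x%02x lanes:", rest, rest_mask);
    for (int lane = 0; lane < kBlock; ++lane) {
      // A set bit selects that lane in _mm256_blendv_ps; only the highest
      // `rest` lanes survive, i.e. the elements the main loop did not touch.
      if (rest_mask & (1u << lane)) std::printf(" %d", lane);
    }
    std::printf("\n");  // e.g. rest=3 -> lanes 5 6 7
  }
  return 0;
}
```

For example, with `num_` = 11 the full-block loop covers the first 8 elements and rest = 3, so the masked re-load at `j = offset + 3` contributes only lanes 5–7, which are exactly elements 8–10.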
...@@ -33,6 +33,9 @@ limitations under the License. */ ...@@ -33,6 +33,9 @@ limitations under the License. */
constexpr int repeat = 20000; constexpr int repeat = 20000;
// TODO(TJ): benchmark and test should be separated,
// benchmark should verify more sizes
inline double GetCurrentUS() { inline double GetCurrentUS() {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { ...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST(JitKernel, vrelu) { TEST(JitKernel, vrelu) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) { for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -10.f, 1.f); RandomVec<float>(d, x.data(), -10.f, 1.f);
...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { ...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST(JitKernel, vexp) { TEST(JitKernel, vexp) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -231,7 +234,7 @@ void vsigmoid_better( ...@@ -231,7 +234,7 @@ void vsigmoid_better(
TEST(JitKernel, vsigmoid) { TEST(JitKernel, vsigmoid) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -295,7 +298,7 @@ void vtanh_better( ...@@ -295,7 +298,7 @@ void vtanh_better(
TEST(JitKernel, vtanh) { TEST(JitKernel, vtanh) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -386,7 +389,7 @@ void lstm_ctht_better( ...@@ -386,7 +389,7 @@ void lstm_ctht_better(
TEST(JitKernel, lstm) { TEST(JitKernel, lstm) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
int d4 = d * 4; int d4 = d * 4;
int d3 = d * 3; int d3 = d * 3;
std::vector<float> x(d4), xref(d4); std::vector<float> x(d4), xref(d4);
...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) { ...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) {
float* zref_data = zref.data(); float* zref_data = zref.data();
auto trefs = GetCurrentUS(); auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
vadd_ref(d, x_data, y_data, zref_data); vaddrelu_ref(d, x_data, y_data, zref_data);
} }
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS(); auto tmkls = GetCurrentUS();
......
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
...@@ -38,6 +38,7 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -38,6 +38,7 @@ TEST(selected_rows_functor, gpu_add) {
{static_cast<int64_t>(rows1.size()), row_numel}), {static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place); gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
PADDLE_ENFORCE(cudaDeviceSynchronize());
std::vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{ std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
......
...@@ -19,7 +19,8 @@ namespace paddle { ...@@ -19,7 +19,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test,
typename Enable = void>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -32,8 +33,8 @@ struct ValueClip { ...@@ -32,8 +33,8 @@ struct ValueClip {
} }
}; };
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test, typename Enable>
void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const DeviceContext& context, const framework::Tensor* X, const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto logits = EigenMatrix<T>::From(*X);
...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( ...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
.broadcast(one_by_class)); .broadcast(one_by_class));
} }
template <typename DeviceContext, typename T> template <class DeviceContext>
class SoftmaxFunctor<DeviceContext, T, true> { using enable_if_CPU = typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type;
template <typename DeviceContext>
class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto in_dims = X->dims();
auto softmax = EigenMatrix<T>::From(*Y); auto out_dims = Y->dims();
const float* in_data = X->data<float>();
float* out_data = Y->data<float>();
const int kBatchDim = 0; const int kBatchDim = 0;
const int kClassDim = 1; const int kClassDim = 1;
// 2D data. Batch x C
const int batch_size = logits.dimension(kBatchDim); const int batch_size = in_dims[kBatchDim];
const int num_classes = logits.dimension(kClassDim); const int num_classes = in_dims[kClassDim];
std::vector<float> entities(batch_size);
Eigen::DSizes<int, 1> along_class(kClassDim); auto blas = math::GetBlas<DeviceContext, float>(context);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); for (int n = 0; n < batch_size; ++n) {
Eigen::DSizes<int, 2> one_by_class(1, num_classes); entities[n] = in_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
auto shifted_logits = (logits - entities[n] = in_data[n * num_classes + c] > entities[n]
logits.maximum(along_class) ? in_data[n * num_classes + c]
.eval() : entities[n];
.reshape(batch_by_one) }
.broadcast(one_by_class)); for (int c = 0; c < num_classes; ++c) {
out_data[n * num_classes + c] =
softmax.device(*context.eigen_device()) = shifted_logits.exp(); in_data[n * num_classes + c] - entities[n];
softmax.device(*context.eigen_device()) = (softmax * }
softmax.sum(along_class) }
.inverse()
.eval() blas.VEXP(num_classes * batch_size, out_data, out_data);
.reshape(batch_by_one) for (int n = 0; n < batch_size; ++n) {
.broadcast(one_by_class)); entities[n] = out_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
entities[n] += out_data[n * num_classes + c];
}
blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
}
} }
}; };
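The CPU specialization above is the usual max-shifted softmax, with the exponential vectorized through `blas.VEXP` and the final normalization done with `blas.SCAL`; per sample n over C = `num_classes` classes it computes:

```latex
m_n = \max_{c} x_{nc}, \qquad
y_{nc} = \frac{e^{\,x_{nc} - m_n}}{\sum_{c'=1}^{C} e^{\,x_{nc'} - m_n}}
```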
......
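The `enable_if_CPU` alias in the hunk above gates a partial specialization of `SoftmaxFunctor` so the hand-written float path is chosen only for the CPU device context. A minimal, self-contained sketch of that dispatch pattern follows (the `is_test` flag is dropped to keep it short; all names are illustrative, not Paddle APIs):

```cpp
#include <iostream>
#include <type_traits>

struct CPUDeviceContext {};
struct CUDADeviceContext {};

template <typename DeviceContext>
using enable_if_CPU = typename std::enable_if<
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type;

// Primary template: generic (Eigen-style) path for any device/type.
template <typename DeviceContext, typename T, typename Enable = void>
struct Functor {
  void operator()() const { std::cout << "generic path\n"; }
};

// Partial specialization selected via SFINAE only when DeviceContext is the
// CPU context and T is float, mirroring the SoftmaxFunctor change above.
template <typename DeviceContext>
struct Functor<DeviceContext, float, enable_if_CPU<DeviceContext>> {
  void operator()() const { std::cout << "hand-optimized CPU float path\n"; }
};

int main() {
  Functor<CPUDeviceContext, float>()();   // specialized path
  Functor<CUDADeviceContext, float>()();  // generic path
  Functor<CPUDeviceContext, double>()();  // generic path (only float is specialized)
  return 0;
}
```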
...@@ -32,7 +32,7 @@ class PReluKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,7 @@ class PReluKernel : public framework::OpKernel<T> {
T* o_ptr = out->mutable_data<T>(context.GetPlace()); T* o_ptr = out->mutable_data<T>(context.GetPlace());
const T* alpha_ptr = alpha->data<T>(); const T* alpha_ptr = alpha->data<T>();
std::string mode = context.Attr<std::string>("mode"); auto& mode = context.Attr<std::string>("mode");
int numel = x->numel(); int numel = x->numel();
auto dim = x->dims(); auto dim = x->dims();
...@@ -99,6 +99,8 @@ class PReluGradKernel : public framework::OpKernel<T> { ...@@ -99,6 +99,8 @@ class PReluGradKernel : public framework::OpKernel<T> {
index = 0; index = 0;
if (dalpha) { if (dalpha) {
T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace()); T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel());
if (mode == "channel") { if (mode == "channel") {
for (i = 0; i < numel; i++) { for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]); temp = numel / (dim[0] * dim[1]);
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
namespace paddle { namespace paddle {
...@@ -33,11 +34,7 @@ class RecordIOFileReader : public framework::FileReader { ...@@ -33,11 +34,7 @@ class RecordIOFileReader : public framework::FileReader {
protected: protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override { void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
std::unique_ptr<std::lock_guard<std::mutex>> guard; platform::LockGuardPtr<std::mutex> guard(mutex_);
if (ThreadSafe) {
guard.reset(new std::lock_guard<std::mutex>(*mutex_));
}
bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out); bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
if (!ok) { if (!ok) {
out->clear(); out->clear();
......
...@@ -21,42 +21,38 @@ limitations under the License. */ ...@@ -21,42 +21,38 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
TEST(scatter, ScatterUpdate) { TEST(scatter, ScatterUpdate) {
// using namespace paddle::framework; paddle::framework::Tensor src;
// using namespace paddle::platform; paddle::framework::Tensor index;
// using namespace paddle::operators; paddle::framework::Tensor output;
paddle::framework::Tensor* src = new paddle::framework::Tensor(); auto* p_src = src.mutable_data<float>(paddle::framework::make_ddim({1, 4}),
paddle::framework::Tensor* index = new paddle::framework::Tensor(); paddle::platform::CPUPlace());
paddle::framework::Tensor* output = new paddle::framework::Tensor(); auto* p_index = index.mutable_data<int>(paddle::framework::make_ddim({1}),
paddle::platform::CPUPlace());
float* p_src = nullptr;
int* p_index = nullptr; for (size_t i = 0; i < 4; ++i) {
p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}), p_src[i] = static_cast<float>(i);
paddle::platform::CPUPlace()); }
p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
paddle::platform::CPUPlace());
for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
p_index[0] = 1; p_index[0] = 1;
float* p_output = output->mutable_data<float>( auto* p_output = output.mutable_data<float>(
paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace()); paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
for (int64_t i = 0; i < output.numel(); ++i) {
p_output[i] = 0;
}
auto* cpu_place = new paddle::platform::CPUPlace(); auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::platform::CPUDeviceContext ctx(*cpu_place);
paddle::operators::ScatterAssign<float>(ctx, *src, *index, output); paddle::operators::ScatterAssign<float>(ctx, src, index, &output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
for (size_t i = 4; i < 8; ++i) { for (size_t i = 4; i < 8; ++i) {
EXPECT_EQ(p_output[i], static_cast<float>(i - 4)); EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
} }
for (size_t i = 4; i < 8; ++i) for (size_t i = 4; i < 8; ++i)
EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4)); EXPECT_EQ(output.data<float>()[i], static_cast<float>(i - 4));
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
delete src;
delete index;
delete output;
} }
...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> { ...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
#ifdef ON_INFER #ifdef PADDLE_ON_INFERENCE
math::SoftmaxFunctor<DeviceContext, T, true>()( math::SoftmaxFunctor<
DeviceContext, T,
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
#else #else
math::SoftmaxFunctor<DeviceContext, T, false>()( math::SoftmaxFunctor<DeviceContext, T, false>()(
......
...@@ -87,13 +87,16 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -87,13 +87,16 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); auto src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1}); framework::DDim src_stride({5, 1});
int dst[4]; int dst[4];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst))); auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1}); framework::DDim dst_stride({2, 1});
...@@ -108,9 +111,6 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -108,9 +111,6 @@ TEST(StridedMemcpy, GPUCrop) {
ASSERT_EQ(2, dst[1]); ASSERT_EQ(2, dst[1]);
ASSERT_EQ(3, dst[2]); ASSERT_EQ(3, dst[2]);
ASSERT_EQ(4, dst[3]); ASSERT_EQ(4, dst[3]);
memory::Free(gpu0, gpu_dst);
memory::Free(gpu0, gpu_src);
} }
TEST(StridedMemcpy, GPUConcat) { TEST(StridedMemcpy, GPUConcat) {
...@@ -124,12 +124,13 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -124,12 +124,13 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8]; int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst))); auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
framework::DDim src_stride({2, 1}); framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
...@@ -151,9 +152,6 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -151,9 +152,6 @@ TEST(StridedMemcpy, GPUConcat) {
for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
ASSERT_EQ(expect_dst[i], dst[i]); ASSERT_EQ(expect_dst[i], dst[i]);
} }
memory::Free(gpu0, gpu_dst);
memory::Free(gpu0, gpu_src);
} }
#endif #endif
......
...@@ -73,3 +73,4 @@ cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) ...@@ -73,3 +73,4 @@ cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
IF(WITH_GPU) IF(WITH_GPU)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
ENDIF() ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
...@@ -56,10 +56,17 @@ DEFINE_double( ...@@ -56,10 +56,17 @@ DEFINE_double(
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocating too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set use_pinned_memory to false.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
inline size_t CpuTotalPhysicalMemory() { size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__ #ifdef __APPLE__
int mib[2]; int mib[2];
mib[0] = CTL_HW; mib[0] = CTL_HW;
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine. //! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize(); size_t CpuMaxAllocSize();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace platform {
// Even though this source file does not contain any code, it is better to keep
// this source file for the cmake dependency.
} // namespace platform
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace platform {
class CUDADeviceGuard {
public:
explicit inline CUDADeviceGuard(int dev_id) {
int prev_id = platform::GetCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetDeviceId(dev_id);
}
}
inline ~CUDADeviceGuard() {
if (prev_id_ != -1) {
platform::SetDeviceId(prev_id_);
}
}
CUDADeviceGuard(const CUDADeviceGuard& o) = delete;
CUDADeviceGuard& operator=(const CUDADeviceGuard& o) = delete;
private:
int prev_id_{-1};
};
} // namespace platform
} // namespace paddle
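A brief illustration of how the RAII guard above is meant to be used (the surrounding helper is hypothetical and assumes a CUDA-enabled Paddle build):

```cpp
#include "paddle/fluid/platform/cuda_device_guard.h"

// Hypothetical helper: run some work against a specific device and restore
// the previously current device automatically, even on early return.
void RunOnDevice(int dev_id) {
  paddle::platform::CUDADeviceGuard guard(dev_id);
  // ... cudaMemcpyAsync / kernel launches now target dev_id ...
}  // ~CUDADeviceGuard() calls SetDeviceId(prev_id_) if the device was switched
```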
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -81,6 +82,16 @@ enum class PoolingMode { ...@@ -81,6 +82,16 @@ enum class PoolingMode {
kAverageInclusive, kAverageInclusive,
}; };
enum ActivationMode {
kNone, // activation identity
kSigmoid,
kRelu,
kRelu6,
kReluX,
kTanh,
kBandPass,
};
#if CUDNN_VERSION < 6000 #if CUDNN_VERSION < 6000
#pragma message "CUDNN version under 6.0 is supported at best effort." #pragma message "CUDNN version under 6.0 is supported at best effort."
#pragma message "We strongly encourage you to move to 6.0 and above." #pragma message "We strongly encourage you to move to 6.0 and above."
...@@ -120,6 +131,26 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { ...@@ -120,6 +131,26 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
} }
#endif // CUDNN_VERSION < 6000 #endif // CUDNN_VERSION < 6000
inline ActivationMode StringToActivationMode(const std::string& str) {
if (str == "identity") {
return ActivationMode::kNone;
} else if (str == "sigmoid") {
return ActivationMode::kSigmoid;
} else if (str == "relu") {
return ActivationMode::kRelu;
} else if (str == "relu6") {
return ActivationMode::kRelu6;
} else if (str == "relux") {
return ActivationMode::kReluX;
} else if (str == "tanh") {
return ActivationMode::kTanh;
} else if (str == "bandpass") {
return ActivationMode::kBandPass;
} else {
PADDLE_THROW("Unknown activation string: %s", str);
}
}
template <typename T> template <typename T>
class CudnnDataType; class CudnnDataType;
...@@ -368,6 +399,58 @@ class ScopedSpatialTransformerDescriptor { ...@@ -368,6 +399,58 @@ class ScopedSpatialTransformerDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
}; };
class ScopedActivationDescriptor {
public:
ScopedActivationDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&desc_));
}
~ScopedActivationDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(desc_));
}
template <typename T>
inline cudnnActivationDescriptor_t descriptor(
const std::string& act, double value_max = static_cast<double>(0.)) {
double relu_ceiling = 0.0;
ActivationMode activation_mode = StringToActivationMode(act);
cudnnActivationMode_t mode;
switch (activation_mode) {
#if CUDNN_VERSION >= 7100
case ActivationMode::kNone:
mode = CUDNN_ACTIVATION_IDENTITY;
break;
#endif
case ActivationMode::kRelu6:
relu_ceiling = 6.0;
mode = CUDNN_ACTIVATION_CLIPPED_RELU;
break;
case ActivationMode::kReluX:
relu_ceiling = value_max;
mode = CUDNN_ACTIVATION_CLIPPED_RELU;
break;
case ActivationMode::kRelu:
mode = CUDNN_ACTIVATION_RELU;
break;
case ActivationMode::kSigmoid:
mode = CUDNN_ACTIVATION_SIGMOID;
break;
case ActivationMode::kTanh:
mode = CUDNN_ACTIVATION_TANH;
break;
default:
PADDLE_THROW("unrecognized activation mode: %d .",
static_cast<int>(activation_mode));
}
CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
return desc_;
}
private:
cudnnActivationDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor);
};
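A sketch of how the scoped descriptor above could be used from an op kernel (illustrative code, not part of the diff; it assumes a CUDA build with cuDNN and that the declarations above live in platform/cudnn_helper.h):

```cpp
#include "paddle/fluid/platform/cudnn_helper.h"

// Hypothetical snippet: build a clipped-ReLU activation descriptor and hand it
// to a cuDNN call such as cudnnConvolutionBiasActivationForward.
void ConfigureActivation() {
  paddle::platform::ScopedActivationDescriptor scoped_act;
  cudnnActivationDescriptor_t act_desc =
      scoped_act.descriptor<float>("relu6");  // kRelu6 -> CLIPPED_RELU, ceiling 6.0
  (void)act_desc;  // pass to the cuDNN convolution/activation routine here
}  // the descriptor is destroyed when scoped_act goes out of scope
```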
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
......
...@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
...@@ -18,6 +17,7 @@ limitations under the License. */ ...@@ -18,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
namespace paddle { namespace paddle {
...@@ -120,11 +120,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -120,11 +120,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
void* allocate(size_t num_bytes) const override { void* allocate(size_t num_bytes) const override {
return paddle::memory::Alloc(place_, num_bytes); auto buf = paddle::memory::Alloc(place_, num_bytes,
memory::Allocator::kScratchpad);
void* retv = buf->ptr();
allocations_[buf->ptr()] = std::move(buf);
return retv;
} }
void deallocate(void* buffer) const override { void deallocate(void* buffer) const override {
paddle::memory::Free(place_, buffer); allocations_.erase(allocations_.find(buffer));
} }
void* scratchpad() const override { void* scratchpad() const override {
...@@ -151,37 +155,35 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -151,37 +155,35 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
const cudaDeviceProp* device_prop_; // not owned; const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
}; };
CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
: workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { : workspace_(nullptr), stream_(stream), place_(place) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
} }
CudnnHolder::~CudnnHolder() { CudnnHolder::~CudnnHolder() {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
if (workspace_ != nullptr) {
paddle::memory::Free(place_, workspace_);
}
} }
void CudnnHolder::ReallocateWorkspace(size_t required_workspace_len) { void CudnnHolder::ReallocateWorkspace(size_t required_workspace_len) {
if (required_workspace_len <= workspace_len_) { if (required_workspace_len <= WorkspaceSize()) {
return; return;
} }
if (workspace_ != nullptr) { if (workspace_ != nullptr) {
// Maybe someone is using the current workspace // Maybe someone is using the current workspace
PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
paddle::memory::Free(place_, workspace_); workspace_.reset();
} }
workspace_ = paddle::memory::Alloc(place_, required_workspace_len); workspace_ = paddle::memory::Alloc(place_, required_workspace_len,
workspace_len_ = required_workspace_len; paddle::memory::Allocator::kScratchpad);
} }
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: place_(place), cudnn_holder_(nullptr) { : place_(place), cudnn_holder_(nullptr) {
SetDeviceId(place_.device); CUDADeviceGuard guard(place_.device);
compute_capability_ = GetCUDAComputeCapability(place_.device); compute_capability_ = GetCUDAComputeCapability(place_.device);
multi_process_ = GetCUDAMultiProcessors(place_.device); multi_process_ = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/memory/malloc.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
...@@ -85,17 +85,32 @@ class CudnnHolder { ...@@ -85,17 +85,32 @@ class CudnnHolder {
template <typename Callback> template <typename Callback>
void RunFuncImpl(Callback&& cudnn_func, size_t required_workspace_len) { void RunFuncImpl(Callback&& cudnn_func, size_t required_workspace_len) {
if (required_workspace_len > workspace_len_) { if (required_workspace_len > WorkspaceSize()) {
ReallocateWorkspace(required_workspace_len); ReallocateWorkspace(required_workspace_len);
} }
cudnn_func(workspace_); cudnn_func(WorkspacePtr());
}
inline void* WorkspacePtr() {
if (workspace_) {
return workspace_->ptr();
} else {
return nullptr;
}
}
inline size_t WorkspaceSize() {
if (workspace_) {
return workspace_->size();
} else {
return 0;
}
} }
std::mutex& Mutex() { return mtx_; } std::mutex& Mutex() { return mtx_; }
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
void* workspace_; memory::AllocationPtr workspace_;
size_t workspace_len_;
const cudaStream_t* stream_; // not owned; const cudaStream_t* stream_; // not owned;
const CUDAPlace place_; const CUDAPlace place_;
......
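The CudnnHolder changes above replace the raw `workspace_`/`workspace_len_` pair with an `AllocationPtr` plus null-safe accessors. The core idea is a grow-only scratch buffer; the sketch below restates it outside the Paddle allocator (standard library only, illustrative names):

```cpp
#include <cstddef>
#include <memory>

// Grow-only scratch buffer: reallocate only when the requested size exceeds
// the current capacity; an empty holder behaves like a zero-byte workspace.
class ScratchBuffer {
 public:
  void* Ptr() const { return buf_ ? buf_.get() : nullptr; }
  size_t Size() const { return buf_ ? size_ : 0; }
  void Reserve(size_t required) {
    if (required <= Size()) return;  // never shrink
    // The real CudnnHolder synchronizes the CUDA stream before dropping the
    // old buffer, since an in-flight kernel may still be using it.
    buf_.reset(new char[required]);
    size_ = required;
  }

 private:
  std::unique_ptr<char[]> buf_;
  size_t size_{0};
};
```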
...@@ -152,14 +152,15 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -152,14 +152,15 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \ __macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType); \ __macro(cudnnSetConvolutionMathType); \
__macro(cudnnCreateCTCLossDescriptor); \ __macro(cudnnConvolutionBiasActivationForward); \
__macro(cudnnDestroyCTCLossDescriptor); \ __macro(cudnnCreateCTCLossDescriptor); \
__macro(cudnnGetCTCLossDescriptor); \ __macro(cudnnDestroyCTCLossDescriptor); \
__macro(cudnnSetCTCLossDescriptor); \ __macro(cudnnGetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \ __macro(cudnnSetCTCLossDescriptor); \
__macro(cudnnGetCTCLossWorkspaceSize); \
__macro(cudnnCTCLoss); __macro(cudnnCTCLoss);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -19,6 +19,9 @@ limitations under the License. */ ...@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -64,7 +67,7 @@ void InitP2P(std::vector<int> devices) { ...@@ -64,7 +67,7 @@ void InitP2P(std::vector<int> devices) {
LOG(WARNING) << "Cannot enable P2P access from " << devices[i] LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
<< " to " << devices[j]; << " to " << devices[j];
} else { } else {
cudaSetDevice(devices[i]); platform::CUDADeviceGuard guard(devices[i]);
cudaDeviceEnablePeerAccess(devices[j], 0); cudaDeviceEnablePeerAccess(devices[j], 0);
} }
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <memory>
#include <mutex> // NOLINT
namespace paddle {
namespace platform {
/**
 * LockGuard for std::unique_ptr<LockType>. It does nothing when the guarded
 * pointer is nullptr.
 *
 * The advantage of using `LockGuardPtr` instead of
 * std::unique_ptr<std::lock_guard<LockType>> is that this type is purely a
 * stack variable. There is no heap allocation at all.
*/
template <typename LockType>
class LockGuardPtr {
public:
explicit LockGuardPtr(std::unique_ptr<LockType>& lock_ptr) // NOLINT
: lock_(lock_ptr.get()) {
if (lock_) {
lock_->lock();
}
}
~LockGuardPtr() {
if (lock_) {
lock_->unlock();
}
}
LockGuardPtr(const LockGuardPtr&) = delete;
LockGuardPtr& operator=(const LockGuardPtr&) = delete;
LockGuardPtr(LockGuardPtr&&) = delete;
LockGuardPtr& operator=(LockGuardPtr&&) = delete;
private:
LockType* lock_;
};
} // namespace platform
} // namespace paddle
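Usage of the guard above mirrors the RecordIOFileReader change earlier in this diff: lock only when a mutex actually exists. A minimal sketch (the function name is illustrative):

```cpp
#include <memory>
#include <mutex>

#include "paddle/fluid/platform/lock_guard_ptr.h"

// Hypothetical reader method: mutex_ is only allocated when the reader must be
// thread safe; LockGuardPtr degenerates to a no-op when it holds nullptr.
void ReadNext(std::unique_ptr<std::mutex>& mutex_) {
  paddle::platform::LockGuardPtr<std::mutex> guard(mutex_);
  // ... read shared state; the lock (if taken) is released on scope exit ...
}
```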
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -18,8 +18,6 @@ limitations under the License. */ ...@@ -18,8 +18,6 @@ limitations under the License. */
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace {
template <typename T> template <typename T>
class Scale { class Scale {
public: public:
...@@ -36,10 +34,7 @@ class Multiply { ...@@ -36,10 +34,7 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
}; };
} // namespace
using paddle::memory::Alloc; using paddle::memory::Alloc;
using paddle::memory::Free;
using paddle::memory::Copy; using paddle::memory::Copy;
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
...@@ -63,13 +58,13 @@ TEST(Transform, GPUUnary) { ...@@ -63,13 +58,13 @@ TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<CUDADeviceContext> trans; Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
} }
...@@ -89,13 +84,13 @@ TEST(Transform, GPUBinary) { ...@@ -89,13 +84,13 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<CUDADeviceContext> trans; Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
} }
......
...@@ -41,6 +41,7 @@ limitations under the License. */ ...@@ -41,6 +41,7 @@ limitations under the License. */
#include <boost/any.hpp> #include <boost/any.hpp>
#include <boost/mpl/comparison.hpp> #include <boost/mpl/comparison.hpp>
#include <boost/mpl/less_equal.hpp> #include <boost/mpl/less_equal.hpp>
#include <boost/optional.hpp>
#include <boost/variant.hpp> #include <boost/variant.hpp>
// some platform-independent definition // some platform-independent definition
......
...@@ -43,6 +43,7 @@ limitations under the License. */ ...@@ -43,6 +43,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -94,6 +95,7 @@ bool IsCompiledWithDIST() { ...@@ -94,6 +95,7 @@ bool IsCompiledWithDIST() {
} }
PYBIND11_PLUGIN(core) { PYBIND11_PLUGIN(core) {
paddle::memory::allocation::UseAllocatorStrategyGFlag();
py::module m("core", "C++ core of PaddlePaddle"); py::module m("core", "C++ core of PaddlePaddle");
// using framework in this function. Since it is inside a function, it will // using framework in this function. Since it is inside a function, it will
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "pybind11/common.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -57,7 +58,8 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -57,7 +58,8 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
prod *= dims_outside[i - 1]; prod *= dims_outside[i - 1];
} }
framework::Tensor dst_tensor; framework::Tensor dst_tensor;
if (paddle::platform::is_gpu_place(tensor.place())) { bool is_gpu = paddle::platform::is_gpu_place(tensor.place());
if (is_gpu) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>()); auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
...@@ -73,16 +75,44 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -73,16 +75,44 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
dst_tensor = tensor; dst_tensor = tensor;
} }
if (std::type_index(typeid(CUR_TYPE)) == std::string dtype = std::type_index(typeid(CUR_TYPE)) ==
std::type_index(typeid(platform::float16))) { std::type_index(typeid(platform::float16))
return pybind11::buffer_info( ? std::string("e") // np.dtype('e') == np.float16
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), : pybind11::format_descriptor<CUR_TYPE>::format();
"e", /* np.dtype('e') == np.float16 */
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); if (is_gpu) {
// manually construct a py_buffer if is_gpu, since the gpu data has been
// copied into CPU memory above.
// TODO(yy): Does the following code leak memory?
Py_buffer *py_buffer =
reinterpret_cast<Py_buffer *>(malloc(sizeof(Py_buffer)));
py_buffer->format = strdup(dtype.c_str());
py_buffer->itemsize = sizeof(CUR_TYPE);
py_buffer->ndim = framework::arity(dst_tensor.dims());
py_buffer->len = tensor.numel();
py_buffer->strides = reinterpret_cast<Py_ssize_t *>(
malloc(sizeof(Py_ssize_t) * strides.size()));
for (size_t i = 0; i < strides.size(); ++i) {
py_buffer->strides[i] = strides[i];
}
py_buffer->shape = reinterpret_cast<Py_ssize_t *>(
malloc(sizeof(Py_ssize_t) * tensor.dims().size()));
for (int i = 0; i < tensor.dims().size(); ++i) {
py_buffer->shape[i] = tensor.dims()[i];
}
py_buffer->readonly = false;
py_buffer->suboffsets = nullptr;
py_buffer->obj = nullptr;
py_buffer->buf =
malloc(static_cast<size_t>(py_buffer->len * py_buffer->itemsize));
memcpy(py_buffer->buf, dst_tensor.data<CUR_TYPE>(),
static_cast<size_t>(py_buffer->len * py_buffer->itemsize));
return pybind11::buffer_info(py_buffer, true);
} else { } else {
return pybind11::buffer_info( return pybind11::buffer_info(
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), dtype,
pybind11::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
} }
} else { } else {
...@@ -112,17 +142,16 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { ...@@ -112,17 +142,16 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
} }
} }
// TODO(dzhwinter) : fix the redundent Tensor allocate and free // TODO(dzhwinter) : fix the redundant Tensor allocate and free
template <typename T> template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self->place())) { if (platform::is_gpu_place(self->place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor); framework::Tensor dst;
framework::TensorCopySync(*self, platform::CPUPlace(), dst.get()); framework::TensorCopySync(*self, platform::CPUPlace(), &dst);
dst->data<T>()[offset] = elem; dst.mutable_data<T>(platform::CPUPlace())[offset] = elem;
framework::TensorCopySync(*dst.get(), self->place(), self); framework::TensorCopySync(dst, self->place(), self);
} else if (platform::is_cpu_place(self->place())) { } else if (platform::is_cpu_place(self->place())) {
self->data<T>()[offset] = elem; self->mutable_data<T>(self->place())[offset] = elem;
} }
} }
......
...@@ -16,10 +16,12 @@ limitations under the License. */ ...@@ -16,10 +16,12 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
paddle::memory::allocation::UseAllocatorStrategyGFlag();
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
std::vector<char*> new_argv; std::vector<char*> new_argv;
std::string gflags_env; std::string gflags_env;
...@@ -28,21 +30,16 @@ int main(int argc, char** argv) { ...@@ -28,21 +30,16 @@ int main(int argc, char** argv) {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
new_argv.push_back( new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
#else #else
new_argv.push_back(strdup( new_argv.push_back(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb")); strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"));
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
#endif #endif
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
paddle::memory::Used(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
paddle::memory::Used(paddle::platform::CUDAPlace(0));
#endif
paddle::framework::InitDevices(true); paddle::framework::InitDevices(true);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
...@@ -78,7 +78,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -78,7 +78,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
six.iteritems(word_dict), key=lambda x: x[1], six.iteritems(word_dict), key=lambda x: x[1],
reverse=True)): reverse=True)):
if idx + 3 == dict_size: break if idx + 3 == dict_size: break
fout.write("%s\n" % (cpt.to_bytes(word[0]))) fout.write(word[0].encode('utf-8'))
fout.write('\n')
def __load_dict(tar_file, dict_size, lang, reverse=False): def __load_dict(tar_file, dict_size, lang, reverse=False):
......
...@@ -115,8 +115,9 @@ def __bootstrap__(): ...@@ -115,8 +115,9 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
'dist_threadpool_size', 'eager_delete_tensor_gb', "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb',
'reader_queue_speed_test_mode' 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir'
] ]
if os.name != 'nt': if os.name != 'nt':
read_env_flags.append('warpctc_dir') read_env_flags.append('warpctc_dir')
......
...@@ -283,11 +283,7 @@ def detection_output(loc, ...@@ -283,11 +283,7 @@ def detection_output(loc,
prior_box_var=prior_box_var, prior_box_var=prior_box_var,
target_box=loc, target_box=loc,
code_type='decode_center_size') code_type='decode_center_size')
compile_shape = scores.shape
run_shape = nn.shape(scores)
scores = nn.flatten(x=scores, axis=2)
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True scores.stop_gradient = True
nmsed_outs = helper.create_variable_for_type_inference( nmsed_outs = helper.create_variable_for_type_inference(
......
...@@ -6949,8 +6949,15 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): ...@@ -6949,8 +6949,15 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
t_max(${t_max_type}|24.0): ${t_max_comment} t_max(${t_max_type}|24.0): ${t_max_comment}
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0)
""" """
helper = LayerHelper('brelu', **locals()) helper = LayerHelper('brelu', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
...@@ -6972,8 +6979,15 @@ def leaky_relu(x, alpha=0.02, name=None): ...@@ -6972,8 +6979,15 @@ def leaky_relu(x, alpha=0.02, name=None):
alpha(${alpha_type}|0.02): ${alpha_comment} alpha(${alpha_type}|0.02): ${alpha_comment}
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
y = fluid.layers.leaky_relu(x, alpha=0.01)
""" """
helper = LayerHelper('leaky_relu', **locals()) helper = LayerHelper('leaky_relu', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
...@@ -6994,8 +7008,15 @@ def soft_relu(x, threshold=40.0, name=None): ...@@ -6994,8 +7008,15 @@ def soft_relu(x, threshold=40.0, name=None):
threshold(${threshold_type}|40.0): ${threshold_comment} threshold(${threshold_type}|40.0): ${threshold_comment}
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
y = fluid.layers.soft_relu(x, threshold=20.0)
""" """
helper = LayerHelper('soft_relu', **locals()) helper = LayerHelper('soft_relu', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
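To complement the docstring examples added above, here is a rough NumPy reference for the three activations. The formulas follow my reading of the corresponding activation kernels and are not spelled out in this commit, so treat them as assumptions:

import numpy as np

def brelu_ref(x, t_min=0.0, t_max=24.0):
    return np.clip(x, t_min, t_max)        # clamp into [t_min, t_max]

def leaky_relu_ref(x, alpha=0.02):
    return np.maximum(x, alpha * x)        # x for x > 0, alpha * x otherwise

def soft_relu_ref(x, threshold=40.0):
    clipped = np.clip(x, -threshold, threshold)
    return np.log(1.0 + np.exp(clipped))   # smooth approximation of relu

x = np.linspace(-3.0, 3.0, 7)
print(brelu_ref(x, 1.0, 20.0), leaky_relu_ref(x, 0.01), soft_relu_ref(x, 20.0))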
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_conv2d_op import conv2d_forward_naive
class TestConv2dFusionOp(OpTest):
def setUp(self):
self.op_type = "conv2d_fusion"
self.exhaustive_search = False
self.data_format = "AnyLayout"
self.dtype = np.float32
self.activation = 'relu'
self.add_bias = True
self.add_residual_data = True
self.init_group()
self.init_dilation()
self.init_test_case()
self.init_bias_residual()
self.init_activation()
self.set_search_method()
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv2d_forward_naive(input, filter, self.groups,
conv2d_param).astype(self.dtype)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
if self.add_residual_data:
residual_data = np.random.random(output.shape).astype(self.dtype)
self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
residual_data)
output += residual_data
if self.add_bias:
bias = np.random.random(self.filter_size[0]).astype(self.dtype)
self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
output = output + bias.reshape((1, bias.size, 1, 1))
assert self.activation in ['relu', 'identity']
if self.activation == 'relu':
output = np.maximum(output, 0)
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search,
'activation': self.activation
}
self.outputs = {'Output': output}
def testcuda(self):
return core.is_compiled_with_cuda()
def test_check_output(self):
if self.testcuda():
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
else:
pass
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self):
self.groups = 1
def init_bias_residual(self):
self.add_bias = True
self.add_residual_data = True
def init_activation(self):
self.activation = 'relu'
def set_search_method(self):
self.exhaustive_search = False
class TestWithoutResidual(TestConv2dFusionOp):
def init_bias_residual(self):
self.add_residual_data = False
class TestIdentityActivation(TestConv2dFusionOp):
def init_activation(self):
self.activation = 'identity'
class TestWithGroup(TestConv2dFusionOp):
def init_group(self):
self.groups = 3
class TestWithDilation(TestConv2dFusionOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_dilation(self):
self.dilations = [2, 2]
def init_group(self):
self.groups = 3
class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
def set_search_method(self):
self.exhaustive_search = True
if __name__ == '__main__':
unittest.main()
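The expected output assembled in setUp above reduces to a conv -> residual add -> bias add -> activation pipeline. A condensed restatement, reusing the same conv2d_forward_naive helper the test file already imports (sketch only, not part of the commit):

import numpy as np
from test_conv2d_op import conv2d_forward_naive

def conv2d_fusion_ref(inp, filt, bias, residual, groups, conv2d_param, act='relu'):
    out = conv2d_forward_naive(inp, filt, groups, conv2d_param).astype(inp.dtype)
    if residual is not None:
        out = out + residual                        # optional ResidualData input
    out = out + bias.reshape((1, bias.size, 1, 1))  # per-output-channel Bias
    return np.maximum(out, 0) if act == 'relu' else out  # 'relu' or 'identity'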
...@@ -117,7 +117,7 @@ class TestConv2dOp(OpTest):
return
place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
self.check_grad_with_place(
place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
place, {'Input', 'Filter'}, 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self):
if self.dtype == np.float16:
......
...@@ -116,7 +116,7 @@ class TestDataBalance(unittest.TestCase):
print("WARNING: Unittest TestDataBalance skipped. \
For the result is not correct when device count \
is larger than batch size.")
exit(0)
return
fetch_list = [image.name, label.name]
data_appeared = [False] * self.total_ins_num
......
...@@ -42,11 +42,12 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self._sync_mode = False
self._enforce_place = "CPU"
def no_test_simnet_bow(self):
#FIXME(typhoonzero): fix async tests later
def notest_simnet_bow(self):
need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
'IS_SELF_CONTAINED_LR': '1',
}
self.check_with_place(
"dist_simnet_bow.py",
......
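A brief aside on the rename above: unittest's default loader only collects methods whose names start with "test", so changing the prefix effectively disables the async case without deleting it. Minimal illustration (standalone example, not part of the commit):

import unittest

class Demo(unittest.TestCase):
    def test_collected(self):      # discovered and executed
        self.assertTrue(True)

    def notest_disabled(self):     # not discovered by the default loader
        raise RuntimeError("never reached by normal test discovery")

if __name__ == '__main__':
    unittest.main()  # runs only test_collected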
...@@ -72,7 +72,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
sorted(
word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0])) fout.write(word[0].encode('utf-8'))
fout.write('\n')
def __load_dict(tar_file, dict_size, lang, reverse=False):
...@@ -300,8 +301,10 @@ def get_dict(lang, dict_size, reverse=False):
dict: The word dictionary for the specific language.
"""
if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) if lang == "en":
else: dict_size = min(dict_size, TOTAL_DE_WORDS) dict_size = min(dict_size, TOTAL_EN_WORDS)
else:
dict_size = min(dict_size, TOTAL_DE_WORDS)
dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size))
......
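Hypothetical usage of the reformatted helper above, with the signature taken from the hunk header (the dataset is downloaded on first use):

import paddle.v2.dataset.wmt16 as wmt16

word_to_id = wmt16.get_dict("en", dict_size=30000)                 # token -> id
id_to_word = wmt16.get_dict("en", dict_size=30000, reverse=True)   # id -> token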