提交 9e3155e0 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_mnist

test=develop
......@@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 install -U wheel && \
pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 install -U wheel && \
pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
RUN pip3 --no-cache-dir install -U wheel && \
pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 --no-cache-dir install -U wheel && \
pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 --no-cache-dir install -U wheel && \
pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
pip install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 install opencv-python && \
pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 install opencv-python && \
pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
pip --no-cache-dir install -U pip setuptools wheel && \
pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 --no-cache-dir install opencv-python && \
pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 --no-cache-dir install opencv-python && \
pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 --no-cache-dir install opencv-python && \
pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip --no-cache-dir install opencv-python
#For docstring checker
RUN pip3 install pylint pytest astroid isort
RUN pip3.6 install pylint pytest astroid isort
RUN pip3.7 install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker
RUN pip3 --no-cache-dir install pylint pytest astroid isort
RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/
RUN pip3 install -r /root/requirements.txt
RUN pip3.6 install -r /root/requirements.txt
RUN pip3.7 install -r /root/requirements.txt
RUN pip install -r /root/requirements.txt
RUN pip3 --no-cache-dir install -r /root/requirements.txt
RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
RUN pip --no-cache-dir install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip3 install certifi urllib3[secure]
RUN pip3.6 install certifi urllib3[secure]
RUN pip3.7 install certifi urllib3[secure]
RUN pip install certifi urllib3[secure]
RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
RUN pip3 --no-cache-dir install certifi urllib3[secure]
RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
......
......@@ -106,10 +106,10 @@ else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
DEPENDS mkldnn)
DEPENDS mkldnn shared_mkldnn)
endif(WIN32)
ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
IF(WITH_C_API)
INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
ENDIF()
......
......@@ -136,7 +136,7 @@ if (WITH_MKLDNN)
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn
DEPS mkldnn_shared_lib
)
endif ()
......
......@@ -68,18 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
DEPS operator op_registry device_context math_function scope)
if(WITH_GPU)
if (WIN32)
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
......
......@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
framework::ToTypeName(var->Type()), name);
}
}
......
......@@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
#endif
int GetVarDeviceID(
const ir::Graph &graph, const std::string &varname,
const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const;
bool IsScaleLossOp(ir::Node *node) const;
......@@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const;
std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
......@@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int dev_id) const;
int GetOpDeviceID(
const ir::Graph &graph, ir::Node *node,
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
......@@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &shared_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_;
......
......@@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
} else if (var->IsType<SelectedRows>()) {
(*func)(var->GetMutable<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", var->Type().name());
PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
}
}
......@@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
} else if (var.IsType<SelectedRows>()) {
(*func)(var.Get<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", var.Type().name());
PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
}
}
......
......@@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
framework::ToTypeName(var->Type()), name);
}
}
}
......
......@@ -45,6 +45,7 @@ pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */ \
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \
/* CONV inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* Affine Channel inputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \
/* Affine channel outputs */ \
GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */
void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
const ir::Node& ac_scale,
const LoDTensor& ac_bias_tensor,
LoDTensor* eltwise_y_in_tensor) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from AffineChannel
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
scale_tensor->numel(), 1);
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data<float>(),
ac_bias_tensor.numel(), 1);
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array;
// Re-compute weight of conv2d from AffineChannel
auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
auto weights_shape = weights->dims();
auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= scale_array;
}
std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, false /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvAffineChannel fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+affinechannel fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
eltwise_y_in_desc.SetPersistable(true);
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
// Get affine_channel bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
// Initialize eltwise_y
eltwise_y_in_tensor->Resize(ac_bias_tensor->dims());
std::fill_n(eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 0.0f);
// update weights and biases
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(),
name_scope_);
conv_ac_pattern(conv_input, true /*with_eltwise_add*/);
int found_conv_ac_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBN fuse";
GET_CONV_BN_NODES(conv_ac_pattern);
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern);
// Get eltwise_y (conv bias) variable
auto* eltwise_y_in_tensor =
scope->FindVar(eltwise_y_in->Name())->GetMutable<LoDTensor>();
// Get batch norm bias
auto* ac_bias_tensor =
scope->FindVar(ac_bias->Name())->GetMutable<LoDTensor>();
recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor,
eltwise_y_in_tensor);
// Update the elementwise_add node
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph.get(),
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
found_conv_ac_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_ac_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_affine_channel_fuse_pass,
paddle::framework::ir::ConvAffineChannelFusePass);
REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and ConvAffineChannel.
*/
class ConvAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
public:
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -23,66 +23,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
namespace {
void CheckProgram(const ProgramDesc &program) {
#define _INT(role) static_cast<int>(role)
std::map<int, bool> visit;
for (OpDesc *op : program.Block(0).AllOps()) {
// For backward compatibility, some program doesn't have role added.
if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
int role_id =
boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
visit[role_id] = true;
switch (role_id) {
case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR) << "Cannot add backward operator before forward operator "
<< op->Type();
}
break;
case _INT(OpRole::kBackward):
case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
_INT(OpRole::kLoss)) == visit.end(),
"Cannot add backward|loss operator before "
"forward|loss operator %s.",
op->Type());
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add forward|loss operator %s after optimize operator.",
op->Type());
break;
case _INT(OpRole::kOptimize):
case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
"Optimize operators %s must follow backward operator.",
op->Type());
break;
case _INT(OpRole::kLRSched):
case _INT(OpRole::kDist):
case _INT(OpRole::kRPC):
case _INT(OpRole::kNotSpecified):
break;
default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
}
}
#undef _INT
}
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes);
}
......
......@@ -1234,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
return elementwise_add_out;
}
PDNode *patterns::ConvAffineChannel::operator()(
paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) {
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
PDNode *eltwise_op = nullptr;
if (with_eltwise_add) {
eltwise_op =
pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
}
auto *affine_channel_op =
pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel");
// Create variables
// Conv Filter
auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d");
PDNode *eltwise_y_in_var = nullptr;
PDNode *eltwise_out_var = nullptr;
if (with_eltwise_add) {
// Conv output as Bias input
conv_out_var->assert_is_op_input("elementwise_add", "X");
// Bias
eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr())
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
eltwise_out_var = pattern->NewNode(eltwise_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("elementwise_add");
} else {
// Conv output as AffineChannel input
conv_out_var->assert_is_op_input("affine_channel", "X");
}
// AC Scale
auto *ac_scale_var = pattern->NewNode(ac_scale_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Scale");
// AC Bias
auto *ac_bias_var = pattern->NewNode(ac_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("affine_channel", "Bias");
// AC output
auto *ac_out_var = pattern->NewNode(ac_out_repr())
->AsOutput()
->assert_is_op_output("affine_channel");
conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
if (with_eltwise_add) {
eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var})
.LinksTo({eltwise_out_var});
affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
} else {
affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var})
.LinksTo({ac_out_var});
}
return ac_out_var;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase {
PATTERN_DECL_NODE(elementwise_add_out);
};
// Conv with affine_channel
// op: conv + (elementwise_add +) affine_channel
// named nodes:
// conv_weight, conv_out, conv,
// ac_x, ac_scale, ac_bias
// affine_channel, ac_out
struct ConvAffineChannel : public PatternBase {
ConvAffineChannel(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_affine_channel") {}
PDNode* operator()(PDNode* conv_input, bool with_eltwise_add);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(affine_channel);
PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD
// CONV inputs
PATTERN_DECL_NODE(conv_weight); // Filter
// CONV outputs
PATTERN_DECL_NODE(conv_out); // tmp
// ELTWISE inputs
PATTERN_DECL_NODE(eltwise_y_in);
// ELTWISE outputs
PATTERN_DECL_NODE(eltwise_out); // tmp
// AC(Affine_Channel) inputs
PATTERN_DECL_NODE(ac_scale);
PATTERN_DECL_NODE(ac_bias);
// AC outputs
PATTERN_DECL_NODE(ac_out); // Out
};
} // namespace patterns
// Link two ir::Nodes from each other.
......
......@@ -75,6 +75,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names;
std::unordered_map<std::string, std::string> gradname2paramname;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes();
......@@ -99,6 +100,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]);
gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
}
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node);
......@@ -109,7 +111,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
// 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map.
// record origin_grad -> repeated_grad_list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename;
......@@ -124,10 +126,16 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname);
// remove op_role_var for backward ops that outputs grad for a
// parameter.
repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
std::vector<std::string>());
}
}
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update
// not need this update, because only moving mean and variance should be
// differ, trainable parameter scale and bias is the same as other
// parameters.
if (node->Name() == "batch_norm") {
// NOTE: assume bn op created by layers use save var as output mean and
// variance
......@@ -224,16 +232,25 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
var->inputs.push_back(repeated_node);
}
}
}
} // end copy forward backward
// 5. create GRAD merge op node
// 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
// scale(1/num_repeats)
for (auto kv : grad_repeated_map) {
OpDesc sum_op;
sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names;
std::vector<std::string> param_grad_op_role_var;
for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name());
}
// NOTE: use op_role_var to control allreduce op appending in
// multi_devices_graph_pass, we want to append op_role_var
// only once for the merged gradient, so break after first call.
param_grad_op_role_var.push_back(
gradname2paramname.at(kv.first->Var()->Name())); // param
param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad
sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
......@@ -256,6 +273,10 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
param_grad_op_role_var);
auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node);
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -380,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
return &(var.Get<SelectedRows>().value());
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var.Type().name());
ToTypeName(var.Type()));
}
}
......@@ -391,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
return var->GetMutable<SelectedRows>()->mutable_value();
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
ToTypeName(var->Type()));
}
}
......@@ -485,7 +484,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"should be LoDTensor, but the received type is %s",
var->Type().name());
ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
......@@ -504,7 +503,7 @@ const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name());
sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
......@@ -533,7 +532,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name());
sub_name, ToTypeName(var->Type()));
return var->GetMutable<LoDTensor>();
});
return res;
......@@ -775,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is %s.",
var->Type().name());
ToTypeName(var->Type()));
}
}
......@@ -798,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
ToTypeName(var->Type()));
}
}
......@@ -1041,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
for (size_t i = 0; i < vars.size(); ++i) {
const Variable* var = vars[i];
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
......@@ -1057,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
ipt_name);
PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
input.first, i);
int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
Type(), last_input_name, data_type, ipt_name, tmp);
"DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), data_type, tmp);
data_type = tmp;
last_input_name = ipt_name;
}
}
}
......
......@@ -81,6 +81,10 @@ class RuntimeContext {
RuntimeContext(const VariableNameMap& innames,
const VariableNameMap& outnames, const Scope& scope);
RuntimeContext(const VariableValueMap& invars,
const VariableValueMap& outvars)
: inputs(invars), outputs(outvars) {}
VariableValueMap inputs;
VariableValueMap outputs;
};
......@@ -447,8 +451,9 @@ class OperatorWithKernel : public OperatorBase {
void RuntimeInferShape(const Scope& scope, const platform::Place& place,
const RuntimeContext& ctx) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
protected:
virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const;
......
......@@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices(
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers;
buffers.reserve(member_->places_.size());
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
for (size_t i = 0; i < member_->places_.size(); ++i) {
......@@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices(
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 0; i < member_->places_.size(); ++i) {
if (i == 0) continue;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
......
......@@ -165,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const {
Variable* Scope::VarInternal(const std::string& name) {
auto* v = FindVarLocally(name);
if (v != nullptr) return v;
v = new Variable();
vars_[name].reset(v);
vars_.emplace(name, std::unique_ptr<Variable>(v));
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}
......
......@@ -19,52 +19,50 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
template <typename T>
inline bool IsType(const std::type_index& type_index) {
return type_index == std::type_index(typeid(T));
inline bool IsType(const std::type_index& type) {
return type == typeid(T);
}
inline proto::VarType::Type ToVarType(std::type_index type) {
if (IsType<LoDTensor>(type)) {
return proto::VarType_Type_LOD_TENSOR;
} else if (IsType<LoDRankTable>(type)) {
return proto::VarType_Type_LOD_RANK_TABLE;
} else if (IsType<LoDTensorArray>(type)) {
return proto::VarType_Type_LOD_TENSOR_ARRAY;
} else if (IsType<SelectedRows>(type)) {
return proto::VarType_Type_SELECTED_ROWS;
} else if (IsType<ReaderHolder>(type)) {
return proto::VarType_Type_READER;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
inline proto::VarType::Type ToVarType(int type) {
switch (type) {
case proto::VarType::LOD_TENSOR:
case proto::VarType::SELECTED_ROWS:
case proto::VarType::LOD_RANK_TABLE:
case proto::VarType::LOD_TENSOR_ARRAY:
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
PADDLE_THROW("ToVarType:Unsupported type %d", type);
}
}
template <typename Visitor>
inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case proto::VarType_Type_LOD_TENSOR:
switch (var.Type()) {
case proto::VarType::LOD_TENSOR:
visitor(var.Get<LoDTensor>());
return;
case proto::VarType_Type_LOD_RANK_TABLE:
case proto::VarType::LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>());
return;
case proto::VarType_Type_LOD_TENSOR_ARRAY:
case proto::VarType::LOD_TENSOR_ARRAY:
visitor(var.Get<LoDTensorArray>());
return;
case proto::VarType_Type_SELECTED_ROWS:
case proto::VarType::SELECTED_ROWS:
visitor(var.Get<SelectedRows>());
return;
case proto::VarType_Type_READER:
case proto::VarType::READER:
visitor(var.Get<ReaderHolder>());
return;
default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
}
}
......
......@@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) {
op->InferVarType(prog.MutableBlock(0));
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR,
ASSERT_EQ(proto::VarType::LOD_TENSOR,
prog.MutableBlock(0)->Var("test2_out")->GetType());
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/macros.h"
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
namespace paddle {
namespace framework {
// Besides registering variable type id, it is helpful to register a
// var_id -> std::type_index map (for example, get type names according to id)
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct VarIdToTypeIndexMapInitializerImpl {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *id_to_type, MapType2 *type_to_id) {
using Type =
typename std::tuple_element<kStart, VarTypeRegistry::ArgTuple>::type;
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
PADDLE_ENFORCE(id_to_type->count(kId) == 0,
"Registered duplicate type id %d for type %s", kId,
type.name());
PADDLE_ENFORCE(type_to_id->count(type) == 0,
"Registered duplicate type_index %s for id %d", type.name(),
kId);
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
kStart + 1 == kEnd>::Init(id_to_type,
type_to_id);
}
};
template <int kStart, int kEnd>
struct VarIdToTypeIndexMapInitializerImpl<kStart, kEnd, true> {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *, MapType2 *) {}
};
// VarIdToTypeIndexMapInitializer is designed to initialize var_id ->
// std::type_index map and std::type_index -> var_id map
using VarIdToTypeIndexMapInitializer =
VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum,
VarTypeRegistry::kRegisteredTypeNum ==
0>;
struct VarIdToTypeIndexMapHolder {
DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder);
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
"VarId %d is not registered.", var_id);
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
"VarType %s is not registered.", type.name());
return it->second;
}
private:
VarIdToTypeIndexMapHolder() {
VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_);
}
static const VarIdToTypeIndexMapHolder &Instance() {
static const VarIdToTypeIndexMapHolder instance;
return instance;
}
std::unordered_map<int, std::type_index> id_to_type_map_;
std::unordered_map<std::type_index, int> type_to_id_map_;
};
} // namespace detail
const std::type_index &ToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
}
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
int ToTypeId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <tuple>
#include <typeindex>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#ifndef _WIN32
#include <nccl.h>
#endif
#endif
// Users should add forward declarations here
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
class Communicator;
#endif
#endif
} // namespace platform
namespace framework {
class Tensor;
class LoDTensor;
class SelectedRows;
class LoDRankTable;
class ReaderHolder;
class Scope;
} // namespace framework
namespace operators {
template <typename T>
class AlgorithmsCache;
class CudnnRNNCache;
namespace reader {
class LoDTensorBlockingQueueHolder;
} // namespace reader
} // namespace operators
} // namespace paddle
namespace paddle {
namespace framework {
const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type);
namespace detail {
template <bool kStop, int kStart, int kEnd, typename T1, typename T2,
typename... Args>
struct TypePosFinderImpl {
static constexpr int kPos =
std::is_same<T1, T2>::value
? kStart
: TypePosFinderImpl<kStart + 2 == kEnd, kStart + 1, kEnd, T1,
Args...>::kPos;
};
template <int kStart, int kEnd, typename T1, typename T2>
struct TypePosFinderImpl<true, kStart, kEnd, T1, T2> {
static constexpr int kPos = std::is_same<T1, T2>::value ? kStart : -1;
};
// TypePosFinder helps to find the position in which T is inside Args...
// If T is not inside Args..., kPos would be -1
template <typename T, typename... Args>
struct TypePosFinder {
static constexpr int kPos =
TypePosFinderImpl<sizeof...(Args) == 1, 0, sizeof...(Args), T,
Args...>::kPos;
};
template <typename... Args>
struct VarTypeRegistryImpl {
static constexpr size_t kRegisteredTypeNum = sizeof...(Args);
using ArgTuple = std::tuple<Args...>;
// TypePos() returns the position in which T is inside Args...
// If T is not inside Args..., return -1
template <typename T>
static constexpr int TypePos() {
return TypePosFinder<T, Args...>::kPos;
}
// IsRegistered() returns whether T is registered inside RegistryImpl
template <typename T>
static constexpr bool IsRegistered() {
return TypePos<T>() >= 0;
}
};
} // namespace detail
#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \
template <> \
struct VarTypeTrait<type> { \
static_assert(VarTypeRegistry::IsRegistered<type>(), \
"Must be registered type"); \
using Type = type; \
static constexpr int kId = static_cast<int>(proto_id); \
}
/**
* The following codes are designed to register variable types.
* Only registered types can be stored in Variable.
* This registry mechanism is designed to speed up Variable.
*
* Caution: If you want to add more var types, please consider carefully
* whether you really need to add it.
*/
// Users should add other variable types below.
// Paddle would generate unique Ids for each registered variable types.
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
std::map<size_t, Tensor>, operators::reader::LoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
ncclUniqueId, platform::Communicator,
#endif
operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
operators::CudnnRNNCache,
#endif
int, float>;
template <typename T>
struct VarTypeTrait {
static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
using Type = T;
/**
* Unique VarType Id generation.
*
* The auto-generated id should not be the same as any protobuf id defined in
* framework.proto. Therefore, we generate id by adding the type pos and
* maximum protobuf id (i.e., proto::VarType::TUPLE).
*
* However, we may need more protobuf id in the future.
* To avoid changing this auto id generation algorithm frequently, we
* generate id by adding the type pos and twice of maximum protobuf id (i.e.,
* proto::VarType::TUPLE).
*/
static constexpr int kId = VarTypeRegistry::TypePos<T>() +
static_cast<int>(proto::VarType::TUPLE) * 2;
};
// Users should set some of variable type ids to be what is defined in
// framework.proto below
REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
/** End of variable type registration */
template <typename T>
inline constexpr bool IsRegisteredVarType() {
return VarTypeRegistry::IsRegistered<T>();
}
#undef REG_PROTO_VAR_TYPE_TRAIT
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
namespace paddle {
namespace framework {
template <int kPos, int kEnd, bool kStop>
struct TypeIndexChecker {
template <typename SetType1, typename SetType2>
static void Check(SetType1 *var_id_set, SetType2 *type_index_set) {
using Type =
typename std::tuple_element<kPos, VarTypeRegistry::ArgTuple>::type;
static_assert(std::is_same<typename VarTypeTrait<Type>::Type, Type>::value,
"Type must be the same");
constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT
var_id_set->insert(kId);
type_index_set->insert(std::type_index(typeid(Type)));
TypeIndexChecker<kPos + 1, kEnd, kPos + 1 == kEnd>::Check(var_id_set,
type_index_set);
}
};
template <int kPos, int kEnd>
struct TypeIndexChecker<kPos, kEnd, true> {
template <typename SetType1, typename SetType2>
static void Check(SetType1 *, SetType2 *) {}
};
TEST(var_type_traits, check_no_duplicate_registry) {
constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum;
std::unordered_set<int> var_id_set;
std::unordered_set<std::type_index> type_index_set;
TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(
&var_id_set, &type_index_set);
}
template <typename T>
bool CheckVarId(int proto_id) {
static_assert(std::is_same<typename VarTypeTrait<T>::Type, T>::value,
"Type must be the same");
return VarTypeTrait<T>::kId == proto_id;
}
TEST(var_type_traits, check_proto_type_id) {
ASSERT_TRUE(CheckVarId<LoDTensor>(proto::VarType::LOD_TENSOR));
ASSERT_TRUE(CheckVarId<SelectedRows>(proto::VarType::SELECTED_ROWS));
ASSERT_TRUE(CheckVarId<std::vector<Scope *>>(proto::VarType::STEP_SCOPES));
ASSERT_TRUE(CheckVarId<LoDRankTable>(proto::VarType::LOD_RANK_TABLE));
ASSERT_TRUE(CheckVarId<LoDTensorArray>(proto::VarType::LOD_TENSOR_ARRAY));
ASSERT_TRUE(CheckVarId<platform::PlaceList>(proto::VarType::PLACE_LIST));
ASSERT_TRUE(CheckVarId<ReaderHolder>(proto::VarType::READER));
ASSERT_TRUE(CheckVarId<int>(proto::VarType::INT32));
ASSERT_TRUE(CheckVarId<float>(proto::VarType::FP32));
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR);
ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS);
ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES);
ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE);
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY,
proto::VarType::LOD_TENSOR_ARRAY);
ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST);
ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER);
ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH);
ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST);
ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW);
ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE);
ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32);
ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32);
}
TEST(var_type_traits, test_registry) {
using Registry = detail::VarTypeRegistryImpl<int8_t, int32_t, size_t, double>;
ASSERT_TRUE(Registry::TypePos<int8_t>() == 0);
ASSERT_TRUE(Registry::TypePos<int32_t>() == 1);
ASSERT_TRUE(Registry::TypePos<size_t>() == 2);
ASSERT_TRUE(Registry::TypePos<double>() == 3);
ASSERT_TRUE(Registry::TypePos<float>() == -1);
}
} // namespace framework
} // namespace paddle
......@@ -18,7 +18,7 @@
#include <typeindex>
#include <typeinfo>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/var_type_traits.h"
namespace paddle {
namespace framework {
......@@ -27,10 +27,14 @@ class Variable {
public:
template <typename T>
const T& Get() const {
static_assert(
IsRegisteredVarType<T>(),
"Not registered type. Please register T inside var_type_traits.h");
PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing");
PADDLE_ENFORCE(IsType<T>(),
PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
"Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name());
ToTypeName(VarTypeTrait<T>::kId),
ToTypeName(holder_->Type()));
return *static_cast<const T*>(holder_->Ptr());
}
......@@ -39,61 +43,61 @@ class Variable {
template <typename T>
T* GetMutable() {
if (!holder_) {
holder_.reset(new PlaceholderImpl<T>(new T()));
holder_.reset(new PlaceholderImpl<T>());
} else {
PADDLE_ENFORCE(IsType<T>(),
PADDLE_ENFORCE(holder_->Type() == VarTypeTrait<T>::kId,
"Variable must be type %s, the holding type is %s",
typeid(T).name(), holder_->Type().name());
ToTypeName(VarTypeTrait<T>::kId),
ToTypeName(holder_->Type()));
}
return static_cast<T*>(holder_->Ptr());
}
template <typename T>
bool IsType() const {
return holder_ != nullptr &&
std::type_index(typeid(T)) == std::type_index(holder_->Type());
return holder_ && holder_->Type() == VarTypeTrait<T>::kId;
}
void Clear() { holder_.reset(); }
std::type_index Type() const {
int Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type();
}
private:
struct Placeholder {
virtual ~Placeholder() {}
virtual const std::type_info& Type() const = 0;
virtual void* Ptr() const = 0;
virtual ~Placeholder() = default;
inline int Type() const { return type_; }
inline const void* Ptr() const { return ptr_; }
inline void* Ptr() { return ptr_; }
protected:
inline void Init(void* p, int type) {
ptr_ = p;
type_ = type;
}
void* ptr_;
int type_;
};
// Placeholder hides type T, so it doesn't appear as a template
// parameter of Variable.
template <typename T>
struct PlaceholderImpl : public Placeholder {
explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
virtual const std::type_info& Type() const { return type_; }
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
static_assert(
IsRegisteredVarType<T>(),
"Not registered type. Please register T inside var_type_traits.h");
PlaceholderImpl() { this->Init(&obj_, VarTypeTrait<T>::kId); }
std::unique_ptr<T> ptr_;
const std::type_info& type_;
private:
T obj_;
};
std::unique_ptr<Placeholder>
holder_; // pointers to a PlaceholderImpl object indeed.
// name_ is only meaningful with a Scope and accessible by it.
//
// NOTE: Please don't expose name_ by adding methods like
// Variable::Name or Scope::VarName! A variable could have a human
// readable name or an auto-generated scope-unique name. In the
// former case, the caller knows the name and doesn't need to access
// the name; in the latter case, the variable should be identified
// by its address but not the unreadable name.
friend class Scope;
const std::string* name_;
// pointers to a PlaceholderImpl object indeed.
std::unique_ptr<Placeholder> holder_;
};
} // namespace framework
......
......@@ -16,27 +16,28 @@
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
TEST(Variable, GetMutable) {
using paddle::framework::Variable;
struct Tensor {
int content_;
};
namespace paddle {
namespace framework {
TEST(Variable, GetMutable) {
std::unique_ptr<Variable> v(new Variable());
Tensor* t = v->GetMutable<Tensor>();
t->content_ = 1234;
auto* t = v->GetMutable<std::string>();
*t = "1234";
const Tensor& tt = v->Get<Tensor>();
EXPECT_EQ(1234, tt.content_);
const auto& tt = v->Get<std::string>();
EXPECT_EQ("1234", tt);
try {
v->GetMutable<std::string>();
v->GetMutable<Tensor>();
} catch (std::exception& e) {
return;
}
EXPECT_TRUE(false);
}
} // namespace framework
} // namespace paddle
......@@ -42,12 +42,12 @@ void AddTo(Variable* src, Variable* dst) {
class Autograd {
public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {}
Autograd() {}
void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
if (var->stop_gradient_) {
return;
}
std::deque<OpBase*> ready;
ready.push_back(var->pre_op_);
......@@ -57,21 +57,25 @@ class Autograd {
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue;
if (ready_op->input_vars_->at(i)->stop_gradient_) {
continue;
}
OpBase* pre_op = ready_op->pre_ops_->at(i);
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
std::map<std::string, std::vector<VarBase*>> input_grads =
ready_op->ApplyGrad();
for (auto it : input_grads) {
const std::vector<VarBase*>& ingrads = it.second;
for (size_t i = 0; i < ingrads.size(); ++i) {
if (!ingrads[i]) continue;
if (ready_op->input_vars_[it.first][i]->stop_gradient_) {
continue;
}
OpBase* pre_op = ready_op->pre_ops_[it.first][i];
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
}
}
}
}
......@@ -88,92 +92,41 @@ class Autograd {
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) {
if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
}
ret[pre_op] += 1;
}
ret[pre_op] += 1;
}
}
return ret;
}
framework::Scope* scope_;
};
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = false) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>();
}
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
PADDLE_ENFORCE(grad->IsInitialized(), "grad %s must be initialized",
var_desc_->Name());
PADDLE_ENFORCE(grad->Get<framework::LoDTensor>().IsInitialized(),
"variable %s has NO gradient, please set stop_gradient to it",
var_desc_->Name());
VLOG(3) << "apply var grad " << var_desc_->Name() << " "
<< grad->Get<framework::LoDTensor>().data<float>()[0];
if (!grads_) {
grads_ =
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (!grad_op_desc_) {
VLOG(3) << "op with no grad: " << op_desc_->Type();
return {};
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad type: " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
// grad op inputs can be forward inputs, so not in grad_to_var.
continue;
}
VLOG(3) << "op grad input var " << grad_invar;
framework::VarDesc& grad_invar_desc =
block_->FindRecursiveOrCreateVar(grad_invar);
framework::Variable* var = scope->Var(grad_invar);
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
VLOG(3) << "op grad " << grad_op_desc_->Type();
std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
tmp_vars.emplace_back(new framework::Variable());
outputs.push_back(tmp_vars.back().get());
outputs.back()->GetMutable<framework::LoDTensor>();
}
grad_invar_desc.SetShape(
framework::vectorize(var->Get<framework::LoDTensor>().dims()));
......@@ -182,62 +135,46 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
<< framework::vectorize(var->Get<framework::LoDTensor>().dims()).size();
}
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
VLOG(3) << "op grad output var " << outvar;
block_->FindRecursiveOrCreateVar(outvar);
framework::Variable* var = scope->Var(outvar);
if (!var->IsInitialized()) {
VLOG(3) << "init op grad output var " << outvar;
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
grad_op_desc_->InferShape(*block_);
// No need to do static infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
opbase->Run(*scope, platform::CPUPlace());
// `ret` matches exactly with `input_vars_` of forward op.
std::vector<Variable*> ret;
for (size_t i = 0; i < input_vars_->size(); ++i) {
bool found = false;
VarBase* origin_var = (*input_vars_)[i];
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar);
std::string orig_var_name = grad_to_var_->at(outvar);
if (origin_var->var_desc_->Name() != orig_var_name ||
origin_var->stop_gradient_) {
continue;
}
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var_name;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
}
if (!found) {
ret.push_back(nullptr);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
auto& origin_outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* orig_grad = origin_outputs[i];
AddTo(outputs[i], orig_grad);
}
}
return ret;
return input_vars_;
}
void VarBase::RunBackward(framework::Scope* scope) {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
void VarBase::RunBackward() {
if (!pre_op_) return;
Autograd(scope).RunBackward(this);
auto grads_t = grads_->GetMutable<framework::LoDTensor>();
float* data = grads_t->mutable_data<float>(platform::CPUPlace());
std::fill(data, data + grads_t->numel(), 1.0);
PADDLE_ENFORCE(
grads_ ==
pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
Autograd().RunBackward(this);
}
} // namespace imperative
......
......@@ -14,17 +14,69 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx)
: op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = op.GetExpectedKernelType(
framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
}
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
};
class OpBase;
class VarBase {
......@@ -33,15 +85,22 @@ class VarBase {
: pre_op_(nullptr),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(nullptr),
grads_(nullptr),
var_(new framework::Variable()),
grads_(new framework::Variable()),
stop_gradient_(stop_gradient) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
virtual ~VarBase() {
if (var_) {
delete var_;
var_ = nullptr;
}
if (grads_) {
delete grads_;
grads_ = nullptr;
}
}
void RunBackward(framework::Scope* scope);
void RunBackward();
framework::LoDTensor& Grad();
......@@ -55,6 +114,7 @@ class VarBase {
}
OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_;
framework::VarDesc* var_desc_;
......@@ -66,36 +126,24 @@ class VarBase {
class OpBase {
public:
OpBase()
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr),
grad_to_var_(nullptr) {}
OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
}
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
std::map<std::string, std::vector<VarBase*>> ApplyGrad();
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
std::map<std::string, std::vector<VarBase*>> input_vars_;
std::map<std::string, std::vector<VarBase*>> output_vars_;
std::map<std::string, std::vector<OpBase*>> pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_;
std::map<std::string, std::vector<framework::Variable*>> grad_input_vars_;
std::map<std::string, std::vector<framework::Variable*>> grad_output_vars_;
framework::BlockDesc* block_;
};
......
......@@ -20,7 +20,6 @@
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
......@@ -41,16 +40,26 @@ void CreateGradOp(const framework::OpDesc& op_desc,
*grad_op_desc = grad_op_descs[0].release();
}
void InitVar(framework::Variable* var, framework::Variable* grad_var) {
auto& var_t = var->Get<framework::LoDTensor>();
float* data =
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), platform::CPUPlace());
std::fill(data, data + var_t.numel(), 0.0);
}
class Tracer {
public:
explicit Tracer(framework::BlockDesc* root_block)
: root_scope_(new framework::Scope()) {}
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
virtual ~Tracer() {}
void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
const std::vector<VarBase*>& outputs, framework::BlockDesc* block,
const bool stop_gradient) {
void Trace(OpBase* op,
const std::map<std::string, std::vector<VarBase*>>& inputs,
const std::map<std::string, std::vector<VarBase*>>& outputs,
framework::BlockDesc* block) {
std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
......@@ -58,65 +67,112 @@ class Tracer {
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs;
for (VarBase* input : inputs) {
const std::string vname = input->var_desc_->Name();
framework::Variable* var = root_scope_->Var(vname);
input->var_ = var;
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
op->input_vars_ = inputs;
for (auto it : op->input_vars_) {
auto& invars = invars_map[it.first];
for (VarBase* inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
op->op_desc_->Type(), inp->var_desc_->Name());
invars.push_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->pre_op_) {
op->pre_ops_[it.first].push_back(inp->pre_op_);
op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
} else {
LOG(ERROR) << "tracer doesn't support yet";
op->pre_ops_[it.first].push_back(nullptr);
}
VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
<< inp->var_->IsInitialized();
}
if (input->pre_op_) {
op->pre_ops_->push_back(input->pre_op_);
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
}
VLOG(3) << "input vname " << vname << " "
<< var->Get<framework::LoDTensor>().dims().size();
}
*op->output_vars_ = outputs;
for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name();
framework::Variable* var = root_scope_->Var(vname);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
op->output_vars_ = outputs;
for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
outvars.push_back(out->var_);
vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
out->var_->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
out->stop_gradient_ = stop_gradient;
out->pre_op_ = op;
out->pre_op_out_name_ = it.first;
out->pre_op_out_idx_ = i;
outputs[i]->stop_gradient_ = stop_gradient;
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
}
}
VLOG(3) << "tracer running " << op_desc->Type();
op_base->Run(*root_scope_, platform::CPUPlace());
framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
if (!stop_gradient) {
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
VLOG(3) << "tracer create grad op " << grad_op_desc->Type();
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_in_vars.push_back(var->grads_);
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_out_vars.push_back(var->grads_);
}
}
}
op->block_ = block;
}
framework::Scope* GetScope() { return root_scope_.get(); }
private:
std::unique_ptr<framework::Scope> root_scope_;
framework::BlockDesc* root_block_;
};
} // namespace imperative
......
......@@ -127,6 +127,7 @@ struct Argument {
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
......
......@@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",
new int(argument->tensorrt_min_subgraph_size()));
}
// graph_ = pass->Apply(std::move(graph_));
......
......@@ -12,12 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
......@@ -36,7 +38,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
auto teller =
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/);
SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
for (auto *node : graph->Nodes()) {
......@@ -197,10 +200,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable()) {
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
......@@ -215,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size");
.RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size");
......@@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) {
......@@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
......@@ -105,12 +107,14 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
}
void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
int max_batch_size) {
int max_batch_size,
int min_subgraph_size) {
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
// Append after the infer_clean pass.
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
tensorrt_min_subgraph_size_ = min_subgraph_size;
// Append after the conv+affine_channel fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
......
......@@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
}
if (config_.use_mkldnn_) {
......
......@@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
// TODO(Superjomn) should avoid the case when a TensorArray is a
// parameter.
if (var_name == "feed" || var_name == "fetch") continue;
if (var->Type() == typeid(framework::LoDTensorArray)) {
if (var->IsType<framework::LoDTensorArray>()) {
VLOG(4) << "collect " << var_name;
arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
}
......
......@@ -27,8 +27,11 @@ namespace details {
// training phase.
struct TensorArrayBatchCleaner {
TensorArrayBatchCleaner() {
valid_types_.insert(typeid(framework::Tensor));
valid_types_.insert(typeid(framework::LoDTensor));
constexpr auto kTensorId = framework::VarTypeTrait<framework::Tensor>::kId;
constexpr auto kLoDTensorId =
framework::VarTypeTrait<framework::LoDTensor>::kId;
valid_types_.insert(kTensorId);
valid_types_.insert(kLoDTensorId);
}
// Collect the variables that are not Tensor or LoDTensor, and reset them to a
// bool(trick), because some of them are containers, and some operators just
......@@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner {
bool no_tensor_flag_{true};
std::vector<framework::LoDTensorArray *> arrays_;
std::unordered_set<std::type_index> valid_types_;
std::unordered_set<int> valid_types_;
std::unordered_set<framework::Variable *> no_tensor_vars_;
};
......
......@@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
template <typename T>
static void TensorAssignData(PaddleTensor *tensor,
const std::vector<std::vector<T>> &data,
const std::vector<size_t> &lod) {
int size = lod[lod.size() - 1];
tensor->shape.assign({size, 1});
tensor->lod.assign({lod});
TensorAssignData(tensor, data);
}
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
......
......@@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig {
bool use_feed_fetch_ops{true};
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1);
int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN();
......@@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig {
bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_;
// While TensorRT allows an engine optimized for a given max batch size
// to run at any smaller size, the performance for those smaller
// sizes may not be as well-optimized. Therefore, Max batch is best
// equivalent to the runtime batch size.
int tensorrt_max_batchsize_;
// We transform the Ops that can be converted into TRT layer in the model,
// and aggregate these Ops into subgraphs for TRT execution.
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool model_from_memory_{false};
};
......
......@@ -118,11 +118,13 @@ class GpuPassStrategy : public PassStrategy {
public:
GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"infer_clean_graph_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
});
}
......
......@@ -98,10 +98,8 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
auto one_batch = data->NextBatch();
PaddleTensor input_tensor;
input_tensor.name = "word";
input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
input_tensor.lod.assign({one_batch.lod});
input_tensor.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&input_tensor, {one_batch.data});
TensorAssignData<int64_t>(&input_tensor, {one_batch.data}, one_batch.lod);
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
input_slots->assign({input_tensor});
}
......
......@@ -19,11 +19,9 @@ namespace inference {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<std::vector<int64_t>> query_data_all, title_data_all;
std::vector<std::vector<int64_t>> query, title;
std::vector<size_t> lod1, lod2;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
......@@ -33,22 +31,9 @@ struct DataRecord {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= query_data_all.size()) {
data.query_data_all.assign(query_data_all.begin() + batch_iter,
query_data_all.begin() + batch_end);
data.title_data_all.assign(title_data_all.begin() + batch_iter,
title_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
CHECK(!data.query_data_all.empty());
CHECK(!data.title_data_all.empty());
CHECK_EQ(data.query_data_all.size(), data.title_data_all.size());
for (size_t j = 0; j < data.query_data_all.size(); j++) {
// calculate lod
data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size());
data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size());
}
if (batch_end <= query.size()) {
GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end);
GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end);
}
batch_iter += batch_size;
return data;
......@@ -67,8 +52,8 @@ struct DataRecord {
// load title data
std::vector<int64_t> title_data;
split_to_int64(data[1], ' ', &title_data);
query_data_all.push_back(std::move(query_data));
title_data_all.push_back(std::move(title_data));
query.push_back(std::move(query_data));
title.push_back(std::move(title_data));
}
num_samples = num_lines;
}
......@@ -80,15 +65,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
lod_query_tensor.name = "left";
lod_title_tensor.name = "right";
auto one_batch = data->NextBatch();
int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size
int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size
lod_query_tensor.shape.assign({size1, 1});
lod_query_tensor.lod.assign({one_batch.lod1});
lod_title_tensor.shape.assign({size2, 1});
lod_title_tensor.lod.assign({one_batch.lod2});
// assign data
TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query_data_all);
TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title_data_all);
TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query, one_batch.lod1);
TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title, one_batch.lod2);
// Set inputs.
input_slots->assign({lod_query_tensor, lod_title_tensor});
for (auto &tensor : *input_slots) {
......
......@@ -19,11 +19,9 @@ namespace inference {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
std::vector<std::vector<int64_t>> word, mention;
std::vector<size_t> lod; // two inputs have the same lod info.
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
......@@ -33,20 +31,10 @@ struct DataRecord {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= word_data_all.size()) {
data.word_data_all.assign(word_data_all.begin() + batch_iter,
word_data_all.begin() + batch_end);
data.mention_data_all.assign(mention_data_all.begin() + batch_iter,
mention_data_all.begin() + batch_end);
// Prepare LoDs
data.lod.push_back(0);
CHECK(!data.word_data_all.empty());
CHECK(!data.mention_data_all.empty());
CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size());
for (size_t j = 0; j < data.word_data_all.size(); j++) {
// calculate lod
data.lod.push_back(data.lod.back() + data.word_data_all[j].size());
}
if (batch_end <= word.size()) {
GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end);
GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter,
batch_end);
}
batch_iter += batch_size;
return data;
......@@ -65,8 +53,8 @@ struct DataRecord {
// load mention data
std::vector<int64_t> mention_data;
split_to_int64(data[3], ' ', &mention_data);
word_data_all.push_back(std::move(word_data));
mention_data_all.push_back(std::move(mention_data));
word.push_back(std::move(word_data));
mention.push_back(std::move(mention_data));
}
num_samples = num_lines;
}
......@@ -78,14 +66,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
lod_word_tensor.name = "word";
lod_mention_tensor.name = "mention";
auto one_batch = data->NextBatch();
int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size
lod_word_tensor.shape.assign({size, 1});
lod_word_tensor.lod.assign({one_batch.lod});
lod_mention_tensor.shape.assign({size, 1});
lod_mention_tensor.lod.assign({one_batch.lod});
// assign data
TensorAssignData<int64_t>(&lod_word_tensor, one_batch.word_data_all);
TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.mention_data_all);
TensorAssignData<int64_t>(&lod_word_tensor, one_batch.word, one_batch.lod);
TensorAssignData<int64_t>(&lod_mention_tensor, one_batch.mention,
one_batch.lod);
// Set inputs.
input_slots->assign({lod_word_tensor, lod_mention_tensor});
for (auto &tensor : *input_slots) {
......
......@@ -18,12 +18,9 @@ namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
std::vector<std::vector<int64_t>> title1, title2, title3, l1;
std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
std::vector<size_t> lod1, lod2, lod3, l1_lod;
size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
......@@ -33,41 +30,11 @@ struct DataRecord {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= title1_all.size()) {
data.title1_all.assign(title1_all.begin() + batch_iter,
title1_all.begin() + batch_end);
data.title2_all.assign(title2_all.begin() + batch_iter,
title2_all.begin() + batch_end);
data.title3_all.assign(title3_all.begin() + batch_iter,
title3_all.begin() + batch_end);
data.l1_all.assign(l1_all.begin() + batch_iter,
l1_all.begin() + batch_end);
// Prepare LoDs
data.title1_lod.push_back(0);
data.title2_lod.push_back(0);
data.title3_lod.push_back(0);
data.l1_lod.push_back(0);
CHECK(!data.title1_all.empty());
CHECK(!data.title2_all.empty());
CHECK(!data.title3_all.empty());
CHECK(!data.l1_all.empty());
CHECK_EQ(data.title1_all.size(), data.title2_all.size());
CHECK_EQ(data.title1_all.size(), data.title3_all.size());
CHECK_EQ(data.title1_all.size(), data.l1_all.size());
for (size_t j = 0; j < data.title1_all.size(); j++) {
data.title1.push_back(data.title1_all[j]);
data.title2.push_back(data.title2_all[j]);
data.title3.push_back(data.title3_all[j]);
data.l1.push_back(data.l1_all[j]);
// calculate lod
data.title1_lod.push_back(data.title1_lod.back() +
data.title1_all[j].size());
data.title2_lod.push_back(data.title2_lod.back() +
data.title2_all[j].size());
data.title3_lod.push_back(data.title3_lod.back() +
data.title3_all[j].size());
data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
}
if (batch_end <= title1.size()) {
GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end);
GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end);
GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end);
GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end);
}
batch_iter += batch_size;
return data;
......@@ -92,10 +59,10 @@ struct DataRecord {
// load l1 data
std::vector<int64_t> l1_data;
split_to_int64(data[3], ' ', &l1_data);
title1_all.push_back(std::move(title1_data));
title2_all.push_back(std::move(title2_data));
title3_all.push_back(std::move(title3_data));
l1_all.push_back(std::move(l1_data));
title1.push_back(std::move(title1_data));
title2.push_back(std::move(title2_data));
title3.push_back(std::move(title3_data));
l1.push_back(std::move(l1_data));
}
num_samples = num_lines;
}
......@@ -109,24 +76,11 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
title3_tensor.name = "title3";
l1_tensor.name = "l1";
auto one_batch = data->NextBatch();
int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1];
title1_tensor.shape.assign({title1_size, 1});
title1_tensor.lod.assign({one_batch.title1_lod});
int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1];
title2_tensor.shape.assign({title2_size, 1});
title2_tensor.lod.assign({one_batch.title2_lod});
int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1];
title3_tensor.shape.assign({title3_size, 1});
title3_tensor.lod.assign({one_batch.title3_lod});
int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1];
l1_tensor.shape.assign({l1_size, 1});
l1_tensor.lod.assign({one_batch.l1_lod});
// assign data
TensorAssignData<int64_t>(&title1_tensor, one_batch.title1);
TensorAssignData<int64_t>(&title2_tensor, one_batch.title2);
TensorAssignData<int64_t>(&title3_tensor, one_batch.title3);
TensorAssignData<int64_t>(&l1_tensor, one_batch.l1);
TensorAssignData<int64_t>(&title1_tensor, one_batch.title1, one_batch.lod1);
TensorAssignData<int64_t>(&title2_tensor, one_batch.title2, one_batch.lod2);
TensorAssignData<int64_t>(&title3_tensor, one_batch.title3, one_batch.lod3);
TensorAssignData<int64_t>(&l1_tensor, one_batch.l1, one_batch.l1_lod);
// Set inputs.
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor});
for (auto &tensor : *input_slots) {
......
......@@ -176,6 +176,18 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
(*inputs).emplace_back(input_slots);
}
void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
std::vector<std::vector<int64_t>> *out,
std::vector<size_t> *lod, size_t batch_iter,
size_t batch_end) {
lod->clear();
lod->push_back(0);
for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) {
out->push_back(*it);
lod->push_back(lod->back() + (*it).size()); // calculate lod
}
}
void TestOneThreadPrediction(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
......
......@@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel<T> {
output->mutable_data<T>(context.GetPlace());
} else {
PADDLE_THROW("Unexpected branch, input variable type is %s",
in_var->Type().name());
framework::ToTypeName(in_var->Type()));
}
PADDLE_ENFORCE_NOT_NULL(input);
......
......@@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase {
auto &og_inside =
detail::Ref(cur_scope.Var(inside_og_name),
"Cannot find inside gradient %s", inside_og_name);
if (framework::IsType<framework::LoDTensor>(og_outside.Type())) {
if (og_outside.IsType<framework::LoDTensor>()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
auto &inside_tensor =
detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
inside_tensor.set_lod(outside_tensor.lod());
inside_tensor.ShareDataWith(outside_tensor);
} else if (framework::IsType<framework::LoDTensorArray>(
og_outside.Type())) {
} else if (og_outside.IsType<framework::LoDTensorArray>()) {
auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
auto &inside_array =
detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
......@@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase {
var->IsType<LoDTensor>(),
"Currently the type of var only can be LoDTensorArray, "
"or LoDTensor, but the received var[%s] is %s.",
inside_grad_name, var->Type().name());
inside_grad_name, framework::ToTypeName(var->Type()));
if (var->IsType<LoDTensor>()) {
auto &inside_tensor = var->Get<framework::LoDTensor>();
......
......@@ -161,9 +161,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
if ((activation == "identity") &&
(algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
(!residual)) {
if ((activation == "identity") && (!residual)) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
// But test in some case, the speed is slower, change to use
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
......@@ -22,239 +22,6 @@ namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
struct CudnnRNNCache {
CudnnRNNCache() {
x_desc_ = NULL;
y_desc_ = NULL;
dx_desc_ = NULL;
dy_desc_ = NULL;
}
~CudnnRNNCache() { release(); }
cudnnRNNDescriptor_t rnn_desc_;
cudnnTensorDescriptor_t *x_desc_;
cudnnTensorDescriptor_t *y_desc_;
cudnnTensorDescriptor_t *dx_desc_;
cudnnTensorDescriptor_t *dy_desc_;
cudnnTensorDescriptor_t hx_desc_;
cudnnTensorDescriptor_t cx_desc_;
cudnnTensorDescriptor_t hy_desc_;
cudnnTensorDescriptor_t cy_desc_;
cudnnTensorDescriptor_t dhx_desc_;
cudnnTensorDescriptor_t dcx_desc_;
cudnnTensorDescriptor_t dhy_desc_;
cudnnTensorDescriptor_t dcy_desc_;
cudnnTensorDescriptor_t output_x_desc_;
cudnnTensorDescriptor_t output_y_desc_;
cudnnDropoutDescriptor_t dropout_desc_;
size_t weights_size_;
cudnnFilterDescriptor_t w_desc_;
cudnnFilterDescriptor_t dw_desc_;
size_t workspace_size_;
size_t reserve_size_;
Tensor reserve_data_;
Tensor workspace_data_;
Tensor dropout_state_;
size_t max_length_;
float dropout_prob_;
bool is_bidirec_;
int batch_size_;
int input_size_;
int hidden_size_;
int num_layers_;
int seed_;
void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx,
size_t max_len, int batch_size, int input_size, int hidden_size,
int num_layers, float dropout_prob, bool is_bidirec, int seed,
int weight_numel) {
max_length_ = max_len;
batch_size_ = batch_size;
input_size_ = input_size;
hidden_size_ = hidden_size;
num_layers_ = num_layers;
dropout_prob_ = dropout_prob;
is_bidirec_ = is_bidirec;
seed_ = seed;
x_desc_ = new cudnnTensorDescriptor_t[max_length_];
y_desc_ = new cudnnTensorDescriptor_t[max_length_];
dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
int dim_a[3];
int stride_a[3];
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
dim_a[0] = batch_size_;
dim_a[1] = input_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
dim_a[0] = batch_size_;
dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
}
dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
dim_a[1] = batch_size_;
dim_a[2] = hidden_size_;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
size_t state_size;
CUDNN_ENFORCE(
platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
dropout_state_.Resize({static_cast<int64_t>(state_size)}));
auto *dropout_state_data =
dropout_state_.mutable_data<uint8_t>(ctx.GetPlace());
CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
seed_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
#if CUDNN_VERSION >= 6000
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
#else
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_DATA_FLOAT));
#endif
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
"cudnn lstm weight size should be SAME");
int dim_w[3];
dim_w[0] = weights_size_ / sizeof(float);
dim_w[1] = 1;
dim_w[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
reserve_data_.mutable_data<uint8_t>(ctx.GetPlace());
workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
}
void release() {
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
}
delete[] x_desc_;
delete[] y_desc_;
delete[] dx_desc_;
delete[] dy_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
}
};
template <typename T>
class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
public:
......@@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
auto input_w_numel = w->numel();
auto batch_size = x->dims()[1];
cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
hidden_size, num_layers, dropout_prob, is_bidirec,
seed, input_w_numel);
cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size,
input_size, hidden_size, num_layers, dropout_prob,
is_bidirec, seed, input_w_numel);
}
auto run_seq_len = x->dims()[0];
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
struct CudnnRNNCache {
CudnnRNNCache() {
x_desc_ = NULL;
y_desc_ = NULL;
dx_desc_ = NULL;
dy_desc_ = NULL;
}
~CudnnRNNCache() { release(); }
cudnnRNNDescriptor_t rnn_desc_;
cudnnTensorDescriptor_t *x_desc_;
cudnnTensorDescriptor_t *y_desc_;
cudnnTensorDescriptor_t *dx_desc_;
cudnnTensorDescriptor_t *dy_desc_;
cudnnTensorDescriptor_t hx_desc_;
cudnnTensorDescriptor_t cx_desc_;
cudnnTensorDescriptor_t hy_desc_;
cudnnTensorDescriptor_t cy_desc_;
cudnnTensorDescriptor_t dhx_desc_;
cudnnTensorDescriptor_t dcx_desc_;
cudnnTensorDescriptor_t dhy_desc_;
cudnnTensorDescriptor_t dcy_desc_;
cudnnTensorDescriptor_t output_x_desc_;
cudnnTensorDescriptor_t output_y_desc_;
cudnnDropoutDescriptor_t dropout_desc_;
size_t weights_size_;
cudnnFilterDescriptor_t w_desc_;
cudnnFilterDescriptor_t dw_desc_;
size_t workspace_size_;
size_t reserve_size_;
framework::Tensor reserve_data_;
framework::Tensor workspace_data_;
framework::Tensor dropout_state_;
size_t max_length_;
float dropout_prob_;
bool is_bidirec_;
int batch_size_;
int input_size_;
int hidden_size_;
int num_layers_;
int seed_;
void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len,
int batch_size, int input_size, int hidden_size, int num_layers,
float dropout_prob, bool is_bidirec, int seed, int weight_numel) {
max_length_ = max_len;
batch_size_ = batch_size;
input_size_ = input_size;
hidden_size_ = hidden_size;
num_layers_ = num_layers;
dropout_prob_ = dropout_prob;
is_bidirec_ = is_bidirec;
seed_ = seed;
x_desc_ = new cudnnTensorDescriptor_t[max_length_];
y_desc_ = new cudnnTensorDescriptor_t[max_length_];
dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
int dim_a[3];
int stride_a[3];
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
dim_a[0] = batch_size_;
dim_a[1] = input_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
dim_a[0] = batch_size_;
dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
}
dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
dim_a[1] = batch_size_;
dim_a[2] = hidden_size_;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
size_t state_size;
CUDNN_ENFORCE(
platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
dropout_state_.Resize({static_cast<int64_t>(state_size)}));
auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
seed_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
#if CUDNN_VERSION >= 6000
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
#else
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_DATA_FLOAT));
#endif
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
"cudnn lstm weight size should be SAME");
int dim_w[3];
dim_w[0] = weights_size_ / sizeof(float);
dim_w[1] = 1;
dim_w[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
reserve_data_.mutable_data<uint8_t>(place);
workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
workspace_data_.mutable_data<uint8_t>(place);
}
void release() {
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
}
delete[] x_desc_;
delete[] y_desc_;
delete[] dx_desc_;
delete[] dy_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
}
};
} // namespace operators
} // namespace paddle
......@@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
} else {
PADDLE_THROW(
"% should be LoDTensor or SelectedRows, but the received type is %s",
ctx.Inputs("Ids")[0], ids_var->Type().name());
ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type()));
}
}
};
......
......@@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
z = ctx.Output<framework::LoDTensor>("Out");
} else {
PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
x_var->Type().name());
framework::ToTypeName(x_var->Type()));
}
z->mutable_data<T>(ctx.GetPlace());
......
......@@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/fill_constant_op.h"
namespace paddle {
namespace operators {
class FillConstantInferShape : public framework::InferShapeBase {
class FillConstantOp : public framework::OperatorWithKernel {
public:
void operator()(framework::InferShapeContext *ctx) const override {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
auto& shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
auto &out_var = *scope.FindVar(Output("Out"));
if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
auto cpu = platform::CPUPlace();
tensor->mutable_data(cpu, data_type);
} else {
tensor->mutable_data(dev_place, data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, tensor, value);
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace());
}
};
class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
boost::get<int>(op_desc.GetAttr("dtype")));
auto& out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetDataType(data_type);
}
};
class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -107,7 +79,13 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::FillConstantOpVarTypeInference);
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
ops::FillConstantOpVarTypeInference,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
ops::FillConstantKernel<double>,
ops::FillConstantKernel<int64_t>,
ops::FillConstantKernel<int>,
ops::FillConstantKernel<paddle::platform::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
framework::Tensor *tensor = nullptr;
framework::Variable *out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else if (out_var->IsType<framework::SelectedRows>()) {
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(
framework::make_ddim(ctx.Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fill constant op's output only"
"supports SelectedRows and LoDTensor");
}
if (force_cpu) {
tensor->mutable_data(platform::CPUPlace(), data_type);
} else {
tensor->mutable_data(ctx.GetPlace(), data_type);
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(ctx.GetPlace());
math::set_constant(dev_ctx, tensor, value);
}
};
} // namespace operators
} // namespace paddle
......@@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
ctx.Inputs("Grad").front(),
framework::ToTypeName(grad_var->Type()));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto avg_squared_grad_out_tensor =
......
......@@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
......@@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref;
......
......@@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
ctx.Inputs("Grad").front(),
framework::ToTypeName(grad_var->Type()));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
......@@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
ctx.Inputs("Grad").front(),
framework::ToTypeName(grad_var->Type()));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
......@@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
ctx.Inputs("Grad").front(),
framework::ToTypeName(grad_var->Type()));
auto* param_out = ctx.Output<Tensor>("ParamOut");
auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
......
......@@ -395,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
PADDLE_THROW(
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s",
grad_var->Type().name()));
framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
......
......@@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
out_var->Type().name());
framework::ToTypeName(out_var->Type()));
}
}
};
......
......@@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel {
PADDLE_THROW("Cannot find the input data type by all input data");
}
PADDLE_THROW("Unexpected branch. Input type is %s",
x_vars[0]->Type().name());
framework::ToTypeName(x_vars[0]->Type()));
}
};
......
......@@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel<T> {
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
out_var->Type().name());
framework::ToTypeName(out_var->Type()));
}
}
};
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
......@@ -27,9 +26,7 @@ void BindTracer(pybind11::module *m) {
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
.def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
.def("trace", &imperative::Tracer::Trace);
}
} // namespace pybind
......
......@@ -125,9 +125,7 @@ PYBIND11_MODULE(core, m) {
// .def(py::init<>())
.def(py::init<bool>(), py::arg("stop_gradient") = false)
.def("_run_backward",
[](imperative::VarBase &self, framework::Scope *scope) {
self.RunBackward(scope);
})
[](imperative::VarBase &self) { self.RunBackward(); })
.def("_grad_name", &imperative::VarBase::GradName)
.def("_grad", &imperative::VarBase::Grad)
.def_property("grad_value",
......@@ -154,9 +152,9 @@ PYBIND11_MODULE(core, m) {
[](const imperative::VarBase &self) { return self.stop_gradient_; },
[](imperative::VarBase &self, bool stop_gradient) {
self.stop_gradient_ = stop_gradient;
});
})
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<>())
.def_property(
"desc", [](const imperative::OpBase &self) { return self.op_desc_; },
......
......@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#=================================================
# Utils
#=================================================
......@@ -418,13 +417,6 @@ EOF
else
ctest --output-on-failure
fi
# make install should also be test when unittest
make install -j `nproc`
pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
fi
}
......@@ -922,6 +914,7 @@ function main() {
;;
assert_api)
assert_api_not_changed ${PYTHON_ABI:-""}
assert_api_spec_approvals
;;
test_inference)
gen_capi_package
......@@ -946,6 +939,15 @@ function main() {
run_test
assert_api_not_changed ${PYTHON_ABI:-""}
;;
cmake_gen)
cmake_gen ${PYTHON_ABI:-""}
;;
gen_fluid_lib)
gen_fluid_lib
;;
test_fluid_lib)
test_fluid_lib
;;
*)
print_usage
exit 0
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import collections
from collections import defaultdict
import contextlib
import os
import re
......@@ -369,13 +370,11 @@ class Variable(object):
self._ivar.stop_gradient = stop_gradient
def _numpy(self):
scope = _imperative_tracer().get_scope()
tensor = core.get_variable_tensor(scope, self.desc.name())
tensor = self._ivar.value.get_tensor()
return np.array(tensor)
def _backward(self):
scope = _imperative_tracer().get_scope()
self._ivar._run_backward(scope)
self._ivar._run_backward()
def _gradient(self):
return np.array(self._ivar._grad())
......@@ -710,20 +709,20 @@ class Operator(object):
if _in_imperative_mode():
self.iop = core.OpBase()
self.iop.desc = self.desc
self.inputs = []
self.inputs = defaultdict(list)
if inputs is not None:
for inp in inputs.values():
if isinstance(inp, Variable):
self.inputs.append(inp)
elif isinstance(inp, list) or isinstance(inp, tuple):
self.inputs.extend(inp[:])
self.outputs = []
for k, v in six.iteritems(inputs):
if isinstance(v, Variable):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for out in outputs.values():
if isinstance(out, Variable):
self.outputs.append(out)
elif isinstance(out, list) or isinstance(out, tuple):
self.outputs.extend(out[:])
for k, v in six.iteritems(outputs):
if isinstance(v, Variable):
self.outputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.outputs[k].extend([var._ivar for var in v])
def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET
......@@ -1302,8 +1301,7 @@ class Block(object):
def _trace_op(self, op, stop_gradient=False):
if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
[v._ivar for v in op.outputs], self.desc,
_imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
stop_gradient)
def _insert_op(self, index, *args, **kwargs):
......
......@@ -45,8 +45,7 @@ def to_variable(value, block=None):
name=None,
shape=value.shape,
dtype=value.dtype)
scope = framework._imperative_tracer().get_scope()
var = scope.var(py_var.name)
var = py_var._ivar.value
tensor = var.get_tensor()
tensor.set(value, core.CPUPlace())
return py_var
......
......@@ -20,7 +20,7 @@ import six
import sys
import numpy as np
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
from . import unique_name
from paddle.fluid.imperative import base as imperative_base
from paddle.fluid.initializer import Constant, Xavier
......@@ -313,11 +313,22 @@ class LayerHelper(object):
param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param)
return param
self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs())
if _in_imperative_mode():
self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs())
# In imperative mode, we want the returned parameter to be
# initialized so that it can be used imperatively.
return self.startup_program.global_block().create_parameter(
dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
else:
self.startup_program.global_block().create_parameter(
dtype=dtype,
shape=shape,
**attr._to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr._to_kwargs())
def get_parameter(self, name):
param = self.main_program.global_block().var(name)
......
......@@ -148,7 +148,7 @@ class ParallelExecutor(object):
trainers_endpoints), "num_trainers == len(end_points)"
build_strategy.trainers_endpoints = trainers_endpoints
# step5: get persistable_vars, parameter_vars, places. persistable_vars
# step6: get persistable_vars, places. persistable_vars
# need be broadcast to other local_scope.
persistable_vars = set([
cpt.to_text(v.name) for v in [
......@@ -164,7 +164,7 @@ class ParallelExecutor(object):
places = list(map(place_obj, self._places))
# step6: init ParallelExecutor
# step7: init ParallelExecutor
self.executor = core.ParallelExecutor(
places, persistable_vars, main.desc,
cpt.to_text(loss_name)
......
......@@ -185,8 +185,10 @@ def main(use_cuda, parallel):
if __name__ == '__main__':
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
if not on_ci:
for use_cuda in (False, True):
for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
......@@ -15,6 +15,18 @@
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
def __func__(*args, **kwargs):
if on_ci:
return
return func(*args, **kwargs)
return __func__
class TestDistSeResneXt2x2(TestDistBase):
......@@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase):
self._sync_mode = True
self._use_reader_alloc = False
@skip_ci
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
......@@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase):
self._mem_opt = True
self._use_reader_alloc = False
@skip_ci
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
......@@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase):
self._sync_mode = False
self._use_reader_alloc = False
@skip_ci
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100)
......
......@@ -38,7 +38,9 @@ class MyLayer(fluid.imperative.PyLayer):
def forward(self, inputs):
x = fluid.layers.relu(inputs)
self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)]
x = fluid.layers.elementwise_mul(x, x)
x = fluid.layers.reduce_sum(x)
return [x]
class MLP(fluid.imperative.PyLayer):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
from functools import partial
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
def get_places():
places = []
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
scope = fluid.core.Scope()
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
with fluid.program_guard(main_prog, startup_prog):
yield
def bow_net(data,
label,
dict_dim,
is_sparse=False,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestWeightDecay(unittest.TestCase):
def setUp(self):
self.word_dict = paddle.dataset.imdb.word_dict()
reader = paddle.batch(
paddle.dataset.imdb.train(self.word_dict), batch_size=4)()
self.train_data = [next(reader) for _ in range(5)]
self.learning_rate = .5
def run_executor(self, place, feed_list, loss):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
main_prog = fluid.default_main_program()
loss_set = []
for data in self.train_data:
out = exe.run(main_prog,
feed=feeder.feed(data),
fetch_list=[loss.name])
print("loss %s" % (np.average(out)))
loss_set.append(np.average(out))
return loss_set
def run_parallel_exe(self,
place,
feed_list,
loss,
use_cuda=True,
use_reduce=False,
use_fast_executor=False,
use_ir_memory_optimize=False):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
if use_fast_executor:
exec_strategy.use_experimental_executor = True
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.memory_optimize = use_ir_memory_optimize
parallel_exe = fluid.ParallelExecutor(
use_cuda,
loss_name=loss.name,
exec_strategy=exec_strategy,
build_strategy=build_strategy)
loss_set = []
for data in self.train_data:
out = parallel_exe.run(feed=feeder.feed(data),
fetch_list=[loss.name])
print("loss %s" % (np.average(out)))
loss_set.append(np.average(out))
return loss_set
def check_weight_decay(self,
place,
model,
use_parallel_exe=False,
use_reduce=False):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
startup_prog.random_seed = 1
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost = model(data, label, len(self.word_dict))
param_list = [(var, var * self.learning_rate)
for var in main_prog.block(0).all_parameters()]
optimizer = fluid.optimizer.Adagrad(
learning_rate=self.learning_rate)
optimizer.minimize(avg_cost)
for params in param_list:
updated_p = fluid.layers.elementwise_sub(
x=params[0], y=params[1])
fluid.layers.assign(input=updated_p, output=params[0])
if use_parallel_exe:
loss = self.run_parallel_exe(
place, [data, label],
loss=avg_cost,
use_cuda=True,
use_reduce=use_reduce)
else:
loss = self.run_executor(place, [data, label], loss=avg_cost)
return loss
def test_weight_decay(self):
model = partial(bow_net, is_sparse=False)
for place in get_places():
loss = self.check_weight_decay(place, model, use_parallel_exe=False)
loss2 = self.check_weight_decay(
place, model, use_parallel_exe=True, use_reduce=False)
for i in range(len(loss)):
assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5)
loss3 = self.check_weight_decay(
place, model, use_parallel_exe=True, use_reduce=True)
for i in range(len(loss)):
assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册