提交 ddfb9f11 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_shared_ptr

test=develop
......@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
if (HAVE_SHLWAPI)
set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
endif(HAVE_SHLWAPI)
endif (WIN32)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
......
......@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
common_link(${TARGET_NAME})
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
endfunction(cc_binary)
function(cc_test TARGET_NAME)
......@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
if(WIN32)
list(APPEND win32_deps shlwapi)
if("${cc_test_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_test_DEPS python)
list(APPEND win32_deps ${PYTHON_LIBRARIES})
target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
endif()
endif(WIN32)
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${win32_deps})
endif(WIN32)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(NAME ${TARGET_NAME}
......@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......
......@@ -88,6 +88,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
......@@ -210,6 +211,7 @@ paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], va
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......
......@@ -48,6 +48,17 @@ pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 3 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
......
......@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var;
}
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z ->
// concat_op
PDNode *patterns::TransposeFlattenConcat::operator()(
std::vector<PDNode *> conv_in, int times) {
// The times represents the repeat times of the
// {trans, trans_out, flatten, flatten_out}
const int kNumFields = 4;
const int kTransOutOffset = 1;
const int kFlattenOffset = 2;
const int kFlattenOutOffset = 3;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
->assert_is_op("transpose2"));
nodes.push_back(
pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
->assert_is_op_output("transpose2")
->assert_is_op_input("flatten2", "X")
->AsIntermediate());
nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
->assert_is_op("flatten2"));
nodes.push_back(
pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
->assert_is_op_output("flatten2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op = pattern->NewNode(GetNodeName("concat"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
->assert_is_op_output("concat")
->AsOutput();
std::vector<PDNode *> flatten_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// trans
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// trans_out
nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
// flatten
nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
{nodes[i * kNumFields + kTransOutOffset]});
// flatten_out
nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
{nodes[i * kNumFields + kFlattenOffset]});
flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
}
concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
return concat_out;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out); // Out
};
struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
} // namespace patterns
// Link two ir::Nodes from each other.
......
......@@ -50,7 +50,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
// the other one should be unused empty var.
if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
x->outputs[1]->outputs.size() == 0;
x->outputs[1]->outputs.empty();
} else {
satisfied_all =
satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("transpose2", "X")
->AsInput());
}
patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
std::vector<Node *> nodes;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
PADDLE_ENFORCE(subgraph.at(input_nodes[i]));
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
}
Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
std::vector<std::string> input_names;
std::vector<int> trans_axis = boost::get<std::vector<int>>(
nodes[kTransOffset]->Op()->GetAttr("axis"));
int flatten_axis =
boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
std::string output_name = concat_out->Name();
for (int i = 0; i < times; i++) {
input_names.push_back(nodes[i * kNumFields]->Name());
}
framework::OpDesc new_op_desc;
new_op_desc.SetType("fusion_transpose_flatten_concat");
new_op_desc.SetInput("X", input_names);
new_op_desc.SetAttr("trans_axis", trans_axis);
new_op_desc.SetAttr("flatten_axis", flatten_axis);
new_op_desc.SetAttr("concat_axis", concat_axis);
new_op_desc.SetOutput("Out", {output_name});
new_op_desc.Flush();
// Create a new node for the fused op.
auto *new_conv_op = graph->CreateOpNode(&new_op_desc);
std::unordered_set<const Node *> delete_nodes;
for (int i = 0; i < times; i++) {
nodes[i * kNumFields]->outputs.push_back(new_conv_op);
new_conv_op->inputs.push_back(nodes[i * kNumFields]);
delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
}
delete_nodes.insert(concat_op);
new_conv_op->outputs.push_back(concat_out);
concat_out->inputs.push_back(new_conv_op);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), delete_nodes);
};
gpd(graph.get(), handler);
return graph;
}
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// There may be many transpose-flatten structures in a model, and the output of
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
public:
virtual ~TransposeFlattenConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -391,7 +391,7 @@ class ExecutionContext {
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_GE(allocation_ptr->size(),
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
......
......@@ -27,6 +27,8 @@
namespace paddle {
namespace imperative {
std::map<int, py::object> py_funcs_;
using framework::Variable;
void AddTo(Variable* src, Variable* dst) {
......@@ -55,6 +57,7 @@ class Autograd {
if (var->stop_gradient_) {
return;
}
VLOG(3) << "start autograd";
std::deque<OpBase*> ready;
ready.push_back(var->pre_op_);
......@@ -120,51 +123,57 @@ framework::LoDTensor& VarBase::GradValue() {
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (!grad_op_desc_) {
if (!grad_op_desc_ && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type();
return {};
}
VLOG(3) << "op grad " << grad_op_desc_->Type();
std::vector<std::unique_ptr<framework::Variable>> tmp_vars;
std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
tmp_vars.emplace_back(tmp_var);
outputs.push_back(tmp_var);
if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad";
grad_outputs["Out@GRAD"] =
PyLayer::ApplyGrad(backward_id_, grad_input_vars_["X@GRAD"]);
} else {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var);
}
}
}
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
framework::Scope scope;
platform::CPUPlace place;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
p.op.RuntimeInferShape(scope, place, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
}
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i];
framework::Variable* orig_grad = origin_outputs[i];
AddTo(outputs[i], orig_grad);
AddTo(grad, orig_grad);
delete grad;
}
}
return input_vars_;
......@@ -173,6 +182,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
void VarBase::RunBackward() {
if (!pre_op_) return;
VLOG(3) << "start backward";
auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
float* data = grads_t->mutable_data<float>(platform::CPUPlace());
std::fill(data, data + grads_t->numel(), 1.0);
......@@ -183,5 +193,65 @@ void VarBase::RunBackward() {
Autograd().RunBackward(this);
}
void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
py_funcs_[func_id] = py_func;
}
int PyLayer::NumFuncs() { return py_funcs_.size(); }
std::vector<VarBase*> PyLayer::Apply(int func_id,
const std::vector<VarBase*>& inputs) {
std::vector<framework::Variable*> invars;
for (const VarBase* in : inputs) {
invars.push_back(in->var_);
}
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
std::vector<VarBase*> ret;
for (Variable* v : outvars) {
ret.push_back(new VarBase(v, new VarBase(true)));
}
return ret;
}
std::vector<Variable*> PyLayer::ApplyGrad(
int func_id, const std::vector<framework::Variable*>& inputs) {
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
return CallPythonFunc(py_funcs_[func_id], inputs);
}
std::vector<framework::Variable*> PyLayer::CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins) {
py::gil_scoped_acquire guard;
py::tuple in_args(ins.size());
for (size_t i = 0; i < ins.size(); ++i) {
const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
}
VLOG(3) << "pyfunc in " << py::len(in_args);
// TODO(panyx0718): Who owns the returned LoDTensor.
auto ret = callable(in_args);
auto ret_tuple = py::cast<py::tuple>(ret);
size_t ret_num = py::len(ret_tuple);
std::vector<framework::Variable*> outs;
VLOG(3) << "pyfunc out " << ret_num;
for (size_t i = 0; i < ret_num; ++i) {
try {
auto* py_out_tensor = py::cast<framework::LoDTensor*>(ret_tuple[i]);
PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
"Output tensor %d should not be nullptr", i);
auto* var = new framework::Variable();
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(*py_out_tensor);
tensor->set_lod(py_out_tensor->lod());
outs.push_back(var);
} catch (py::cast_error&) {
PADDLE_THROW("The %d-th output must be LoDTensor", i);
}
}
return outs;
}
} // namespace imperative
} // namespace paddle
......@@ -22,12 +22,15 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
namespace py = ::pybind11;
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
......@@ -90,16 +93,21 @@ class OpBase;
*/
class VarBase {
public:
VarBase()
VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
// Owns `var` and `grad`
VarBase(framework::Variable* var, VarBase* grad)
: pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(new framework::Variable()),
grads_(new VarBase(true)),
var_(var),
grads_(grad),
stop_gradient_(false) {}
explicit VarBase(bool stop_gradient)
: pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(new framework::Variable()),
......@@ -144,7 +152,11 @@ class VarBase {
*/
class OpBase {
public:
OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {}
OpBase()
: op_desc_(nullptr),
forward_id_(-1),
grad_op_desc_(nullptr),
backward_id_(-1) {}
virtual ~OpBase() {
if (grad_op_desc_) delete grad_op_desc_;
......@@ -152,8 +164,14 @@ class OpBase {
std::map<std::string, std::vector<VarBase*>> ApplyGrad();
// One of `op_desc_` or `forward_id_` is set, not both.
// For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
framework::OpDesc* op_desc_;
int forward_id_;
// When has backward, one of `grad_op_desc_` or `backward_id_` is set,
// not both.
framework::OpDesc* grad_op_desc_;
int backward_id_;
VarBasePtrMap input_vars_;
VarBasePtrMap output_vars_;
......@@ -173,8 +191,25 @@ class Layer {
std::vector<VarBase> vars;
return vars;
}
};
class PyLayer {
public:
virtual ~PyLayer() {}
static void RegisterFunc(int func_id, const py::object& py_func);
static int NumFuncs();
static std::vector<VarBase*> Apply(int func_id,
const std::vector<VarBase*>& inputs);
static std::vector<framework::Variable*> ApplyGrad(
int func_id, const std::vector<framework::Variable*>& inputs);
virtual void Backward() { LOG(ERROR) << "To support customize"; }
private:
static std::vector<framework::Variable*> CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins);
};
} // namespace imperative
......
......@@ -115,8 +115,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
if (!stop_gradient) {
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
// TODO(panyx): Is this leaked?
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
new std::unordered_map<std::string, std::string>());
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
op->grad_op_desc_ = grad_op_desc;
for (auto it : grad_op_desc->Inputs()) {
......@@ -127,13 +129,15 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_);
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_in_vars.push_back(var->grads_->var_);
// Douts.
grad_in_vars.push_back(var->grads_);
}
}
}
......@@ -145,10 +149,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end());
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_);
if (!var->grads_->IsInitialized()) {
InitVar(var->var_, var->grads_);
}
grad_out_vars.push_back(var->grads_->var_);
grad_out_vars.push_back(var->grads_);
}
}
}
......@@ -156,5 +160,54 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op->block_ = block;
}
std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
const std::vector<VarBase*>& inputs,
bool stop_gradient) {
VLOG(3) << "py_trace";
op->input_vars_["X"] = inputs;
op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) {
if (inp->pre_op_) {
op->pre_ops_["X"].push_back(inp->pre_op_);
op->pre_ops_out_idx_["X"].push_back(inp->pre_op_out_idx_);
} else {
op->pre_ops_["X"].push_back(nullptr);
}
}
auto& outputs = op->output_vars_["Out"];
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
out->stop_gradient_ = stop_gradient;
out->pre_op_ = op;
out->pre_op_out_name_ = "Out";
out->pre_op_out_idx_ = i;
}
if (!stop_gradient) {
auto& grad_input_vars = op->grad_input_vars_["X@GRAD"];
auto& grad_output_vars = op->grad_output_vars_["Out@GRAD"];
for (const VarBase* inp : inputs) {
grad_input_vars.push_back(inp->var_);
}
for (VarBase* out : outputs) {
grad_input_vars.push_back(out->var_);
}
for (VarBase* out : outputs) {
grad_input_vars.push_back(out->grads_);
if (!grad_input_vars.back()->IsInitialized()) {
InitVar(out->var_, grad_input_vars.back());
}
}
for (const VarBase* inp : inputs) {
grad_output_vars.push_back(inp->grads_);
if (!grad_output_vars.back()->IsInitialized()) {
InitVar(inp->var_, grad_output_vars.back());
}
}
}
return outputs;
}
} // namespace imperative
} // namespace paddle
......@@ -45,6 +45,9 @@ class Tracer {
const std::map<std::string, std::vector<VarBase*>>& outputs,
framework::BlockDesc* block, const bool stop_gradient = false);
std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
bool stop_gradient = false);
private:
framework::BlockDesc* root_block_;
};
......
......@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
Update();
}
void contrib::AnalysisConfig::Update() {
......
......@@ -128,8 +128,8 @@ else()
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
snappystream ${EXTERNAL_LIB})
# NOTE(dzhwinter) shlwapi is deprecated.
set(DEPS ${DEPS} libcmt shlwapi)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
set(DEPS ${DEPS} libcmt ${os_dependency_modules})
endif(NOT WIN32)
if(WITH_GPU)
......
......@@ -141,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add_fuse_pass", //
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
......
......@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
nvinfer1::ILayer* layer = nullptr;
framework::OpDesc op_desc(op, nullptr);
VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
......@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
if (op_type_ == "add") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, shift_weights.get(),
scale_weights.get(), power_weights.get());
layer = scale_layer;
} else if (op_type_ == "mul") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, scale_weights.get(),
shift_weights.get(), power_weights.get());
layer = scale_layer;
}
nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
layer->setName(
("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0));
......@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
engine_->DeclareOutput(output_name);
}
}
protected:
std::string op_type_;
};
class ElementwiseTensorOpConverter : public OpConverter {
......@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
{"max", nvinfer1::ElementWiseOperation::kMAX},
};
class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
};
class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
};
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
......@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
ElementwiseWeightAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
ElementwiseWeightMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
ElementwiseTensorAddOpConverter);
......
......@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
if(WIN32)
target_link_libraries(visualizer shlwapi)
endif(WIN32)
......@@ -137,6 +137,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
bool half_float = false;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
......@@ -157,8 +158,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
VLOG(5) << "NOT use cudnn_tensor_op_math";
}
#endif
Tensor cudnn_workspace;
void* cudnn_workspace_ptr = nullptr;
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
......@@ -181,26 +180,21 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
.Var(kCUDNNFwdAlgoCache)
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
}
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_limit)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
algo = algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
fwd_perf_stat;
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
auto cudnn_find_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, cudnn_output_desc,
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
fwd_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
......@@ -225,23 +219,17 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory
if (!cudnn_workspace_ptr) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_in_bytes)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace_ptr, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
}
};
......@@ -365,20 +353,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_limit = max_user_size * 1024 * 1024;
}
Tensor cudnn_workspace;
void* cudnn_workspace_ptr = nullptr;
if ((input_data || filter_data) && exhaustive_search) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_limit)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
if (exhaustive_search) {
......@@ -396,22 +374,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
}
data_algo = data_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdDataAlgoPerf_t,
kNUM_CUDNN_BWD_DATA_ALGS>
data_perf_stat;
CUDNN_ENFORCE(platform::dynload::
cudnnFindConvolutionBackwardDataAlgorithmEx(
handle, cudnn_filter_desc, filter_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_input_desc,
input_grad_data, kNUM_CUDNN_BWD_DATA_ALGS,
&returned_algo_count, data_perf_stat.data(),
cudnn_workspace_ptr, workspace_size_limit));
auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardDataAlgorithmEx(
handle, cudnn_filter_desc, filter_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_input_desc, input_grad_data,
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
data_perf_stat.data(), cudnn_workspace,
workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_data_func,
workspace_size_limit);
VLOG(3) << "Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
......@@ -462,23 +443,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->GetMutable<
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
}
filter_algo = f_algo_cache->GetAlgorithm(
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
int returned_algo_count;
std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
kNUM_CUDNN_BWD_FILTER_ALGS>
filter_perf_stat;
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle, cudnn_input_desc, input_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_filter_desc, filter_grad_data,
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
filter_perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle, cudnn_input_desc, input_data,
cudnn_output_grad_desc, output_grad_data,
cudnn_conv_desc, cudnn_filter_desc,
filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
&returned_algo_count, filter_perf_stat.data(),
cudnn_workspace, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_bd_f_func,
workspace_size_limit);
return filter_perf_stat[0].algo;
});
VLOG(3) << "cuDNN backward filter algo " << filter_algo;
......@@ -499,16 +482,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
// ------------------- cudnn conv workspace ---------------------
if (!cudnn_workspace_ptr) {
cudnn_workspace =
ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
framework::make_ddim(
{static_cast<int64_t>(workspace_size_in_bytes)}),
dev_ctx);
cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) {
......@@ -516,12 +489,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
cudnn_workspace_ptr, workspace_size_in_bytes, &beta,
cudnn_input_desc, input_grad_data + i * group_offset_in));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_desc, input_grad_data + i * group_offset_in));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
}
// ------------------- cudnn conv backward filter ---------------------
......@@ -529,12 +505,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
cudnn_conv_desc, filter_algo, cudnn_workspace_ptr,
workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + i * group_offset_filter));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc,
input_data + i * group_offset_in, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/data_norm_op.h"
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class DataNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
PADDLE_ENFORCE(ctx->HasOutput("Means"), "");
PADDLE_ENFORCE(ctx->HasOutput("Scales"), "");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int64_t C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("Means", {C});
ctx->SetOutputDim("Scales", {C});
ctx->ShareLoD("X", "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type = ctx.Input<Tensor>("X")->type();
// By default, the type of the scale, bias, mean,
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor).
auto dn_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
dn_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSize")->type(),
"BatchSize input should be of float type");
PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSum")->type(),
"BatchSum input should be of float type");
PADDLE_ENFORCE_EQ(dn_param_type,
ctx.Input<Tensor>("BatchSquareSum")->type(),
"BatchSquareSum input should be of float type");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
}
};
class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<float>("epsilon", "")
.SetDefault(1e-4)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001.");
});
AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor");
AddInput("BatchSize",
"BatchSize is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("BatchSum",
"BatchSum is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("BatchSquareSum",
"The global BatchSquareSum (for training) or "
"estimated BatchSquareSum (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("Means",
"Mean of the history data batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("Scales",
"Scales of the history data batch, "
"will apply to output when training")
.AsIntermediate();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Data Normalization.
Can be used as a normalizer function for data
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
};
template <typename T>
class DataNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
// const bool is_test = ctx.Attr<bool>("is_test");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("Means");
auto *scales = ctx.Output<Tensor>("Scales");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
ConstEigenVectorArrayMap<T> b_size_arr(
ctx.Input<Tensor>("BatchSize")->data<T>(), C);
ConstEigenVectorArrayMap<T> b_sum_arr(
ctx.Input<Tensor>("BatchSum")->data<T>(), C);
ConstEigenVectorArrayMap<T> b_square_sum_arr(
ctx.Input<Tensor>("BatchSquareSum")->data<T>(), C);
EigenVectorArrayMap<T> means_arr(mean_out->mutable_data<T>(ctx.GetPlace()),
C);
EigenVectorArrayMap<T> scales_arr(scales->mutable_data<T>(ctx.GetPlace()),
C);
means_arr = b_sum_arr / b_size_arr;
scales_arr = (b_size_arr / b_square_sum_arr).sqrt();
switch (data_layout) {
case DataLayout::kNCHW: // because it's two dimensions, so make no
// difference
case DataLayout::kNHWC: {
EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C, N) =
(ConstEigenArrayMap<T>(x->data<T>(), C, N).colwise() - means_arr)
.colwise() *
scales_arr;
break;
}
default:
PADDLE_THROW("Unknown storage order: %d", data_layout);
}
}
};
class DataNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
PADDLE_ENFORCE(ctx->HasInput("Means"), "");
PADDLE_ENFORCE(ctx->HasInput("Scales"), "");
// check output
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")),
"");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C});
ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C});
ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace(), layout, library);
}
};
template <typename T>
class DataNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *batch_size = ctx.Input<Tensor>("BatchSize");
const auto *batch_sum = ctx.Input<Tensor>("BatchSum");
const auto *batch_square_sum = ctx.Input<Tensor>("BatchSquareSum");
const auto *scales = ctx.Input<Tensor>("Scales");
const auto *means = ctx.Input<Tensor>("Means");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_batch_size =
ctx.Output<Tensor>(framework::GradVarName("BatchSize"));
auto *d_batch_sum = ctx.Output<Tensor>(framework::GradVarName("BatchSum"));
auto *d_batch_square_sum =
ctx.Output<Tensor>(framework::GradVarName("BatchSquareSum"));
EigenVectorArrayMap<T> d_batch_size_arr(
d_batch_size->mutable_data<T>(ctx.GetPlace()), C);
EigenVectorArrayMap<T> d_batch_sum_arr(
d_batch_sum->mutable_data<T>(ctx.GetPlace()), C);
EigenVectorArrayMap<T> d_batch_square_sum_arr(
d_batch_square_sum->mutable_data<T>(ctx.GetPlace()), C);
d_batch_size_arr.setZero();
d_batch_sum_arr.setZero();
d_batch_square_sum_arr.setZero();
const float epsilon = ctx.Attr<float>("epsilon");
switch (
data_layout) { // because it's two dimensions, so make no difference
case DataLayout::kNCHW:
case DataLayout::kNHWC: {
ConstEigenVectorArrayMap<T> scales_arr(scales->data<T>(), C);
ConstEigenVectorArrayMap<T> means_arr(means->data<T>(), C);
ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N);
EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C, N);
d_x_arr.setZero();
for (int nc = 0; nc < N; ++nc) {
d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr;
}
// calculate data sum and squre sum
ConstEigenVectorArrayMap<T> batch_size_arr(batch_size->data<T>(), C);
ConstEigenVectorArrayMap<T> batch_sum_arr(batch_sum->data<T>(), C);
ConstEigenVectorArrayMap<T> batch_square_sum_arr(
batch_square_sum->data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> sample_sum(C);
Eigen::Array<T, Eigen::Dynamic, 1> sample_square_sum(C);
// calculate data sample sum and square sum
sample_sum.setZero();
sample_square_sum.setZero();
for (int nc = 0; nc < N; ++nc) {
sample_sum += x_arr.col(nc);
sample_square_sum += (x_arr.col(nc) - means_arr).square();
}
// calculate gradient
d_batch_size_arr.setConstant(N);
d_batch_sum_arr = sample_sum;
d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon;
break;
}
default:
PADDLE_THROW("Unknown storage order: %s", data_layout_str);
}
}
};
class DataNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDesc();
op->SetType("data_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetInput("BatchSize", Input("BatchSize"));
op->SetInput("BatchSum", Input("BatchSum"));
op->SetInput("BatchSquareSum", Input("BatchSquareSum"));
op->SetInput("Scales", Output("Scales"));
op->SetInput("Means", Output("Means"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize"));
op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum"));
op->SetOutput(framework::GradVarName("BatchSquareSum"),
InputGrad("BatchSquareSum"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker,
ops::DataNormGradMaker);
REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);
REGISTER_OP_CPU_KERNEL(
data_norm, ops::DataNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
data_norm_grad,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DataNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class DataNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
......@@ -52,11 +52,11 @@ struct BenchFunc {
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...);
}
auto start = paddle::platform::PosixInNsec() / 1e-3;
auto start = paddle::platform::PosixInNsec() * 1e-3;
for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(args...);
}
auto end = paddle::platform::PosixInNsec() / 1e-3;
auto end = paddle::platform::PosixInNsec() * 1e-3;
return static_cast<double>(end - start) / FLAGS_repeat;
}
};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
"The 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim("Y", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set that the data type of computation kernel of
// teacher_student_sigmoid_loss
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossGradientOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal.");
PADDLE_ENFORCE_EQ(dy_dims[1], 1,
"The 2nd dimension of Input(Y@Grad) should be 1.");
PADDLE_ENFORCE_EQ(label_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of "
"Input(Label) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of computation kernel of
// teacher_student_sigmoid_loss
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context());
}
};
class TeacherStudentSigmoidLossOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape [N x 1],"
" where N is the batch size and D is the output. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator.");
AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. "
"Label is a Tensor<float> with shape [N x 1]. ");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The teacher student sigmoid loss.");
AddAttr<float>(
"soft_max_up_bound",
"fp32, if input > soft_max_up_bound, will be bound, default 15.0")
.SetDefault(15.0);
AddAttr<float>(
"soft_max_lower_bound",
"fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
.SetDefault(-15.0);
AddComment(R"DOC(
TeacherStudentSigmoidLoss Operator.
It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that
we add another label(z') to original.
loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
z is click or not
z' is teacher value
label = {-2, -1, [0, 2]}
when z' is not exist, clk = 0 : label = -2;
when z' is not exist, clk = 1 : label = -1;
when z' is exist , clk = 0 : label = 0 + z';
when z' is exist , clk = 1 : label = 1 + z';
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOp,
ops::TeacherStudentSigmoidLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradientOp);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss,
ops::TeacherStudentSigmoidLossOpKernel<float>,
ops::TeacherStudentSigmoidLossOpKernel<double>);
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad,
ops::TeacherStudentSigmoidLossGradOpKernel<float>,
ops::TeacherStudentSigmoidLossGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Tensor* y = context.Output<Tensor>("Y");
const Tensor* x = context.Input<Tensor>("X");
const Tensor* labels = context.Input<Tensor>("Label");
T* y_data = y->mutable_data<T>(context.GetPlace());
const T* x_data = x->data<T>();
const T* label_data = labels->data<T>();
int64_t batch_size = x->dims()[0];
// loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
// log(1 + exp(-abs(x)))
// z is click or not
// z' is value q of feed_fine
// label = {-2, -1, [0, 2]}
// when z' is not exist, clk = 0 : label = -2;
// when z' is not exist, clk = 1 : label = -1;
// when z' is exist , clk = 0 : label = 0 + z';
// when z' is exist , clk = 1 : label = 1 + z';
for (int i = 0; i < batch_size; ++i) {
if (label_data[i] < -1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 0.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else if (label_data[i] < 1.0) {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * label_data[i] +
log(1.0 + exp(-fabs(x_data[i])));
} else {
y_data[i] = (x_data[i] > 0 ? x_data[i] : 0.0) - x_data[i] +
log(1.0 + exp(-fabs(x_data[i]))) +
(x_data[i] > 0 ? x_data[i] : 0.0) -
x_data[i] * (label_data[i] - 1.0) +
log(1.0 + exp(-fabs(x_data[i])));
}
}
}
};
template <typename T>
class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const T* x_data = x->data<T>();
Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
T* dx_data = dx->mutable_data<T>(context.GetPlace());
const Tensor* labels = context.Input<Tensor>("Label");
const T* label_data = labels->data<T>();
T soft_max_up_bound =
static_cast<T>(context.Attr<float>("soft_max_up_bound"));
T soft_max_lower_bound =
static_cast<T>(context.Attr<float>("soft_max_lower_bound"));
int64_t batch_size = x->dims()[0];
const framework::Tensor* dOut =
context.Input<framework::Tensor>(framework::GradVarName("Y"));
const T* dout_data = dOut->data<T>();
for (int i = 0; i < batch_size; ++i) {
T sum_val = x_data[i];
if (sum_val > soft_max_up_bound) {
sum_val = soft_max_up_bound;
} else {
if (sum_val < soft_max_lower_bound) {
sum_val = soft_max_lower_bound;
}
}
T pred = 1.0 / (1.0 + exp(-sum_val));
if (label_data[i] < -1.0) {
dx_data[i] = 0.0 - pred;
} else if (label_data[i] < 0.0) {
dx_data[i] = 1.0 - pred;
} else {
dx_data[i] = label_data[i] - 2.0 * pred;
}
if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) {
dx_data[i] = 0;
}
dx_data[i] *= dout_data[i] * -1;
}
}
};
} // namespace operators
} // namespace paddle
......@@ -15,6 +15,9 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#ifdef _WIN32
#include <numeric>
#endif
#include <random>
#define PADDLE_CUDA_FP16
......
......@@ -61,7 +61,7 @@ namespace platform {
* the allocations of temp_allocation_queue:
* - when the Stream calls cudaStreamSynchronize;
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_tmp_allocation).
* (defined by FLAGS_limit_of_temporary_allocation).
*
* */
class DeviceTemporaryAllocator {
......
......@@ -59,7 +59,7 @@ limitations under the License. */
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) /*do nothing*/
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
namespace paddle {
......
......@@ -271,11 +271,13 @@ TEST(float16, isinf) {
float16 b = float16(INFINITY);
// underflow to 0
float16 native_a(5e-40f);
// overflow to inf
float16 native_b(5e40f);
EXPECT_EQ(std::isinf(a), true);
EXPECT_EQ(std::isinf(b), true);
#ifndef _WIN32
// overflow to inf
float16 native_b(5e40f);
EXPECT_EQ(std::isinf(native_b), true);
#endif
EXPECT_EQ(native_a, float16(0));
}
......
......@@ -15,15 +15,8 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1,
"The up limit of temporary_allocation size.");
DEFINE_double(times_excess_than_required_tmp_allocation, 2,
"times_excess_than_required_tmp_allocation indicates the "
"max size the TemporaryAllocator can return. For example, "
"if the required memory size is N, and "
"times_excess_than_required_tmp_allocation is 2.0, "
"the TemporaryAllocator will return the available allocation "
"that the range of size is N ~ 2*N.");
DEFINE_double(limit_of_temporary_allocation, -1,
"The up limit of temporary_allocation size.");
namespace paddle {
namespace platform {
......@@ -36,25 +29,24 @@ TemporaryAllocation::TemporaryAllocation(
underlying_allocation_(std::move(underlying_allocation)) {}
TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) {
temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
}
bool TemporaryAllocator::IsAllocThreadSafe() const { return true; }
void TemporaryAllocator::Release(const std::function<void()> &callback) {
std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> t_allocations;
std::shared_ptr<std::deque<TemporaryAllocation *>> t_allocations;
{
std::unique_lock<std::mutex> lock(mtx_);
callback();
t_allocations.swap(temp_mem_map_);
temp_mem_map_.reset(new std::multimap<size_t, TemporaryAllocation *>());
t_allocations = temp_mem_queue_;
temp_mem_queue_.reset(new std::deque<TemporaryAllocation *>());
wait_delete_mem_ = 0;
}
for (auto tmp : *t_allocations) {
VLOG(10) << "Delete temporary allocation " << tmp.second->ptr()
<< " size: " << tmp.second->size();
delete tmp.second;
VLOG(10) << "Delete temporary allocation " << tmp->ptr()
<< " size: " << tmp->size();
delete tmp;
}
}
......@@ -62,34 +54,28 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation);
PADDLE_ENFORCE_NOT_NULL(temp_allocation);
if (platform::is_gpu_place(temp_allocation->place())) {
PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_),
"The place should be the same.");
size_t wait_delete_mem = 0;
{
std::unique_lock<std::mutex> lock(mtx_);
temp_mem_map_->emplace(temp_allocation->size(), temp_allocation);
temp_mem_queue_->emplace_back(temp_allocation);
wait_delete_mem_ += temp_allocation->size();
wait_delete_mem = wait_delete_mem_;
VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr()
<< " to delete queue: " << temp_allocation->size() << "; "
<< "wait_delete_mem: " << wait_delete_mem;
<< "wait_delete_mem: " << wait_delete_mem_;
}
if (FLAGS_limit_of_tmp_allocation > 0 &&
wait_delete_mem > static_cast<size_t>(FLAGS_limit_of_tmp_allocation)) {
PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized.");
if (FLAGS_limit_of_temporary_allocation > 0 &&
wait_delete_mem > FLAGS_limit_of_temporary_allocation) {
Release(callback_);
}
return;
}
VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
<< " size: " << temp_allocation->size();
delete temp_allocation;
}
size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
std::unique_lock<std::mutex> lock(mtx_);
return temp_mem_map_ ? temp_mem_map_->size() : 0;
return temp_mem_queue_ ? temp_mem_queue_->size() : 0;
}
void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
......@@ -98,27 +84,6 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
alloc::Allocation *TemporaryAllocator::AllocateImpl(
size_t size, alloc::Allocator::Attr attr) {
{
// Find available allocation in temp_mem_map.
std::unique_lock<std::mutex> lock(mtx_);
if (temp_mem_map_->size()) {
auto it = temp_mem_map_->lower_bound(size);
// FIXME(zcd): Not sure the best value of excess fraction.
if (it != temp_mem_map_->end() &&
it->first <
static_cast<size_t>(
size * FLAGS_times_excess_than_required_tmp_allocation)) {
auto tmp_ptr = it->second;
temp_mem_map_->erase(it);
wait_delete_mem_ -= tmp_ptr->size();
VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": "
<< tmp_ptr->size();
return tmp_ptr;
}
}
}
// If not find the the available allocation, get allocation from
// AllocatorFacadeInstance.
auto raw_allocation =
alloc::AllocatorFacade::Instance().Alloc(place_, size, attr);
auto temp_mem = new TemporaryAllocation(std::move(raw_allocation));
......
......@@ -15,7 +15,6 @@
#pragma once
#include <condition_variable> // NOLINT
#include <deque>
#include <map>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
......@@ -40,7 +39,7 @@ class TemporaryAllocation : public memory::allocation::Allocation {
*
* There is one opportunity to free the allocations of temp_allocation_queue:
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_tmp_allocation).
* (defined by FLAGS_limit_of_temporary_allocation).
*
* */
class TemporaryAllocator : public memory::allocation::Allocator {
......@@ -63,10 +62,11 @@ class TemporaryAllocator : public memory::allocation::Allocator {
private:
platform::Place place_;
// When the allocation is not held by any variable, it should be placed
// to temp_mem_map immediately.
std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{
nullptr};
// to temp_mem_queue immediately.
std::shared_ptr<std::deque<TemporaryAllocation *>> temp_mem_queue_{nullptr};
std::mutex mtx_;
size_t wait_delete_mem_{0};
std::function<void()> callback_;
......
......@@ -18,8 +18,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
DECLARE_int64(limit_of_tmp_allocation);
DECLARE_double(times_excess_than_required_tmp_allocation);
DECLARE_double(limit_of_temporary_allocation);
namespace paddle {
namespace platform {
......@@ -36,7 +35,7 @@ class DummyOp : public framework::OperatorBase {
const platform::Place& place) const override {}
};
TEST(temporary_allocator, test_base_function) {
TEST(temporary_allocator, temporary_allocator) {
platform::CPUPlace cpu_place;
TemporaryAllocator alloc(cpu_place);
alloc.Allocate(100);
......@@ -60,10 +59,10 @@ TEST(temporary_allocator, test_base_function) {
#endif
}
TEST(temporary_allocator, test_flags_function) {
TEST(temporary_allocator, add_callback) {
#ifdef PADDLE_WITH_CUDA
const int64_t limit = FLAGS_limit_of_tmp_allocation;
FLAGS_limit_of_tmp_allocation = 10;
const double limit = FLAGS_limit_of_temporary_allocation;
FLAGS_limit_of_temporary_allocation = 10;
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
......@@ -79,52 +78,7 @@ TEST(temporary_allocator, test_flags_function) {
});
{ gpu_alloc.Allocate(100); }
PADDLE_ENFORCE(deleted);
FLAGS_limit_of_tmp_allocation = limit;
#endif
}
TEST(temporary_allocator, test_reuse_tmp_allocation) {
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
gpu_alloc.SetCallback([]() {});
void* tmp_allocation_ptr1 = nullptr;
{
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
auto tmp_allocation1 = gpu_alloc.Allocate(100);
tmp_allocation_ptr1 = tmp_allocation1->ptr();
}
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
auto tmp_allocation2 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr2 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2);
auto tmp_allocation3 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr3 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3);
#endif
}
TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) {
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
gpu_alloc.SetCallback([]() {});
double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation;
void* tmp_allocation_ptr1 = nullptr;
{
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
auto tmp_allocation1 =
gpu_alloc.Allocate(static_cast<size_t>(100 * excess_fraction - 1));
tmp_allocation_ptr1 = tmp_allocation1->ptr();
}
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
auto tmp_allocation2 = gpu_alloc.Allocate(100);
void* tmp_allocation_ptr2 = tmp_allocation2->ptr();
PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2);
FLAGS_limit_of_temporary_allocation = limit;
#endif
}
......
......@@ -22,9 +22,8 @@ if(WITH_PYTHON)
endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
endif(WITH_AMD_GPU)
if(WIN32)
target_link_libraries(paddle_pybind shlwapi)
endif(WIN32)
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules})
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON)
......@@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) {
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
.def("trace", &imperative::Tracer::Trace);
.def("trace", &imperative::Tracer::Trace)
.def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership);
}
} // namespace pybind
......
......@@ -22,7 +22,7 @@ limitations under the License. */
namespace paddle {
namespace pybind {
class PyLayer : public imperative::Layer {
class Layer : public imperative::Layer {
public:
using imperative::Layer::Layer; // Inherit constructors
......@@ -31,10 +31,6 @@ class PyLayer : public imperative::Layer {
PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
inputs); // NOLINT
}
void Backward() override {
PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT
}
};
class PyOpBase : public imperative::OpBase {
......
......@@ -161,16 +161,44 @@ PYBIND11_MODULE(core, m) {
self.op_desc_ = op_desc;
}
},
py::return_value_policy::reference)
.def_property(
"forward_id",
[](const imperative::OpBase &self) { return self.forward_id_; },
[](imperative::OpBase &self, int forward_id) {
self.forward_id_ = forward_id;
},
py::return_value_policy::reference)
.def_property(
"backward_id",
[](const imperative::OpBase &self) { return self.backward_id_; },
[](imperative::OpBase &self, int backward_id) {
self.backward_id_ = backward_id;
},
py::return_value_policy::reference);
py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>())
.def("forward",
[](imperative::Layer &self,
const std::vector<imperative::VarBase> &inputs) {
return self.Forward(inputs);
})
.def("backward", &imperative::Layer::Backward);
.def("forward", [](imperative::Layer &self,
const std::vector<imperative::VarBase> &inputs) {
return self.Forward(inputs);
});
py::class_<imperative::PyLayer>(m, "PyLayer")
.def(py::init<>())
.def_static(
"apply",
[](int func_id, const std::vector<imperative::VarBase *> &inputs)
-> std::vector<imperative::VarBase *> {
return imperative::PyLayer::Apply(func_id, inputs);
},
py::return_value_policy::take_ownership)
.def_static("register_func",
[](int func_id, const py::object &callable) {
imperative::PyLayer::RegisterFunc(func_id, callable);
})
.def_static("num_funcs", &imperative::PyLayer::NumFuncs);
BindTracer(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
......
......@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators.
from __future__ import print_function
import paddle.dataset.common
import subprocess
import gzip
import numpy
import platform
import tempfile
import struct
from six.moves import range
__all__ = ['train', 'test', 'convert']
......@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size):
def reader():
if platform.system() == 'Darwin':
zcat_cmd = 'gzcat'
elif platform.system() == 'Linux':
zcat_cmd = 'zcat'
else:
raise NotImplementedError()
# According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here.
tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
m = subprocess.Popen(
[zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
tmp_image_file.seek(16) # skip some magic bytes
# Python3 will not take stdout as file
tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
l = subprocess.Popen(
[zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
tmp_label_file.seek(8) # skip some magic bytes
try: # reader could be break.
while True:
labels = numpy.fromfile(
tmp_label_file, 'ubyte', count=buffer_size).astype("int")
if labels.size != buffer_size:
break # numpy.fromfile returns empty slice after EOF.
images = numpy.fromfile(
tmp_image_file, 'ubyte', count=buffer_size * 28 *
28).reshape((buffer_size, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
for i in range(buffer_size):
yield images[i, :], int(labels[i])
finally:
try:
m.terminate()
except:
pass
try:
l.terminate()
except:
pass
with gzip.GzipFile(image_filename, 'rb') as image_file:
img_buf = image_file.read()
with gzip.GzipFile(label_filename, 'rb') as label_file:
lab_buf = label_file.read()
step_label = 0
offset_img = 0
# read from Big-endian
# get file info from magic byte
# image file : 16B
magic_byte_img = '>IIII'
magic_img, image_num, rows, cols = struct.unpack_from(
magic_byte_img, img_buf, offset_img)
offset_img += struct.calcsize(magic_byte_img)
offset_lab = 0
# label file : 8B
magic_byte_lab = '>II'
magic_lab, label_num = struct.unpack_from(magic_byte_lab,
lab_buf, offset_lab)
offset_lab += struct.calcsize(magic_byte_lab)
while True:
if step_label >= label_num:
break
fmt_label = '>' + str(buffer_size) + 'B'
labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
offset_lab += struct.calcsize(fmt_label)
step_label += buffer_size
fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
images_temp = struct.unpack_from(fmt_images, img_buf,
offset_img)
images = numpy.reshape(images_temp, (
buffer_size, rows * cols)).astype('float32')
offset_img += struct.calcsize(fmt_images)
images = images / 255.0 * 2.0 - 1.0
for i in range(buffer_size):
yield images[i, :], int(labels[i])
return reader
......
......@@ -155,8 +155,7 @@ def __bootstrap__():
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce', 'limit_of_tmp_allocation',
'times_excess_than_required_tmp_allocation'
'sync_nccl_allreduce'
]
core.init_gflags([sys.argv[0]] +
......
......@@ -20,10 +20,12 @@ from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.imperative import base
__all__ = ['PyLayer']
__all__ = ['Layer', 'PyLayer']
class PyLayer(core.Layer):
class Layer(core.Layer):
"""Layers composed of operators."""
def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
self._once_built = False
self._dtype = dtype
......@@ -37,8 +39,56 @@ class PyLayer(core.Layer):
self._once_built = True
outputs = self.forward(*inputs)
return outputs
def forward(self, *inputs):
raise NotImplementedError
def backward(self, *inputs):
raise ValueError("Layer shouldn't implement backward")
class PyLayer(core.PyLayer):
"""Layers composed of user-defined python codes."""
def __init__(self):
super(PyLayer, self).__init__()
@staticmethod
def forward(inputs):
raise NotImplementedError
@staticmethod
def backward(douts):
raise NotImplementedError
@classmethod
def __call__(cls, inputs):
tracer = framework._imperative_tracer()
block = framework.default_main_program().current_block()
inputs = [x._ivar for x in inputs]
if not hasattr(cls, 'forward_id'):
cls.forward_id = core.PyLayer.num_funcs() + 1
PyLayer.register_func(cls.forward_id, cls.forward)
cls.backward_id = core.PyLayer.num_funcs() + 1
PyLayer.register_func(cls.backward_id, cls.backward)
iop = core.OpBase()
iop.forward_id = cls.forward_id
iop.backward_id = cls.backward_id
block.ops.append(iop)
ivars = tracer.py_trace(iop, inputs, False)
# ivars = core.PyLayer.apply(cls.forward, inputs)
ret = []
for ivar in ivars:
tensor = ivar.value.get_tensor()
py_var = framework.Variable(
block,
type=core.VarDesc.VarType.LOD_TENSOR,
name=None,
shape=tensor.shape(),
dtype=tensor._dtype(),
ivar=ivar)
ret.append(py_var)
return ret
......@@ -30,7 +30,7 @@ __all__ = [
]
class Conv2D(layers.PyLayer):
class Conv2D(layers.Layer):
def __init__(self,
num_channels,
num_filters,
......@@ -143,7 +143,7 @@ class Conv2D(layers.PyLayer):
return self._helper.append_activation(pre_act)
class Pool2D(layers.PyLayer):
class Pool2D(layers.Layer):
def __init__(self,
pool_size=-1,
pool_type="max",
......@@ -205,7 +205,7 @@ class Pool2D(layers.PyLayer):
return pool_out
class FC(layers.PyLayer):
class FC(layers.Layer):
def __init__(self,
size,
param_attr=None,
......
......@@ -58,6 +58,7 @@ __all__ = [
'adaptive_pool2d',
'adaptive_pool3d',
'batch_norm',
'data_norm',
'beam_search_decode',
'conv2d_transpose',
'conv3d_transpose',
......@@ -180,6 +181,7 @@ __all__ = [
'lstm',
'py_func',
'psroi_pool',
'teacher_student_sigmoid_loss',
'huber_loss',
]
......@@ -2896,6 +2898,133 @@ def batch_norm(input,
return helper.append_activation(batch_norm_out)
def data_norm(input,
act=None,
epsilon=1e-05,
param_attr=None,
data_layout='NCHW',
in_place=False,
use_mkldnn=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False):
"""
**Data Normalization Layer**
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
:math:`input` is the input features over a mini-batch.
.. math::
\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
\ mini-batch\ mean \\\\
\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
Args:
input(variable): The input variable which is a LoDTensor.
act(string, Default None): Activation type, linear|relu|prelu|...
epsilon(float, Default 1e-05):
param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
data_layout(string, default NCHW): NCHW|NHWC
in_place(bool, Default False): Make the input and output of batch norm reuse memory.
use_mkldnn(bool, Default false): ${use_mkldnn_comment}
name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically.
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
Returns:
Variable: A tensor variable which is the result after applying data normalization on the input.
Examples:
.. code-block:: python
data = fluid.layers.data(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.data_norm(input=hidden1)
"""
helper = LayerHelper('data_norm', **locals())
dtype = helper.input_dtype()
input_shape = input.shape
if data_layout == 'NCHW':
channel_num = input_shape[1]
else:
if data_layout == 'NHWC':
channel_num = input_shape[-1]
else:
raise ValueError("unsupported data layout:" + data_layout)
param_shape = [channel_num]
batch_size_default = 1e4
batch_sum_default = 0.0
batch_square_sum_default = 1e4
if param_attr and isinstance(param_attr, dict):
batch_size_default = param_attr.get("batch_size", 1e4)
batch_sum_default = param_attr.get("batch_sum", 0.0)
batch_square_sum_default = param_attr.get("batch_square", 1e4)
# create parameter
batch_size = helper.create_parameter(
attr=ParamAttr(
name=name + '.batch_size',
initializer=Constant(value=float(batch_size_default)),
trainable=True),
shape=param_shape,
dtype=input.dtype)
batch_sum = helper.create_parameter(
attr=ParamAttr(
name=name + '.batch_sum',
initializer=Constant(value=float(batch_sum_default)),
trainable=True),
shape=param_shape,
dtype=input.dtype)
batch_square_sum = helper.create_parameter(
attr=ParamAttr(
name=name + '.batch_square_sum',
initializer=Constant(value=float(batch_square_sum_default)),
trainable=True),
shape=param_shape,
dtype=input.dtype)
means = helper.create_variable(dtype=dtype, stop_gradient=True)
scales = helper.create_variable(dtype=dtype, stop_gradient=True)
data_norm_out = input if in_place else helper.create_variable(dtype=dtype)
helper.append_op(
type="data_norm",
inputs={
"X": input,
"BatchSize": batch_size,
"BatchSum": batch_sum,
"BatchSquareSum": batch_square_sum
},
outputs={"Y": data_norm_out,
"Means": means,
"Scales": scales},
attrs={"epsilon": epsilon,
"use_mkldnn": use_mkldnn})
return helper.append_activation(data_norm_out)
@templatedoc()
def layer_norm(input,
scale=True,
......@@ -3064,9 +3193,9 @@ def group_norm(input,
inputs['Bias'] = bias
# create output
mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_tmp_variable(dtype)
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype)
helper.append_op(
type="group_norm",
......@@ -9264,6 +9393,47 @@ def log_loss(input, label, epsilon=1e-4, name=None):
return loss
def teacher_student_sigmoid_loss(input,
label,
soft_max_up_bound=15.0,
soft_max_lower_bound=-15.0):
"""
**Teacher Student Log Loss Layer**
This layer accepts input predictions and target label and returns the
teacher_student loss.
.. math::
loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
Args:
input (Variable|list): a 2-D tensor with shape [N x 1], where N is the
batch size. This input is a probability computed
by the previous operator.
label (Variable|list): the ground truth which is a 2-D tensor with
shape [N x 1], where N is the batch size.
soft_max_up_bound (float): if input > soft_max_up_bound, will be bound
soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound
Returns:
Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss.
Examples:
.. code-block:: python
cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
"""
helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
out = helper.create_variable(dtype=input.dtype)
helper.append_op(
type='teacher_student_sigmoid_loss',
inputs={'X': [input],
'Label': [label]},
outputs={'Y': [out]},
attrs={"soft_max_lower_bound": float(soft_max_lower_bound), \
"soft_max_up_bound": float(soft_max_up_bound)})
return out
def add_position_encoding(input, alpha, beta, name=None):
"""
**Add Position Encoding Layer**
......
......@@ -15,6 +15,7 @@
import contextlib
import unittest
import numpy as np
import sys
import paddle.fluid as fluid
from paddle.fluid import core
......@@ -22,7 +23,7 @@ from paddle.fluid.imperative.nn import FC
from test_imperative_base import new_program_scope
class MyLayer(fluid.imperative.PyLayer):
class MyLayer(fluid.imperative.Layer):
def __init__(self):
super(MyLayer, self).__init__()
......@@ -34,7 +35,35 @@ class MyLayer(fluid.imperative.PyLayer):
return [x]
class MLP(fluid.imperative.PyLayer):
class MyPyLayer(fluid.imperative.PyLayer):
def __init__(self):
super(MyPyLayer, self).__init__()
@staticmethod
def forward(inputs):
sys.stderr.write('before forward\n')
ret = np.tanh(inputs[0])
sys.stderr.write('after forward: %s\n' % ret)
tensor = core.LoDTensor()
tensor.set(ret, core.CPUPlace())
return tuple([tensor])
@staticmethod
def backward(inputs):
sys.stderr.write('calling into backward: %s\n' % str(inputs))
inp, out, dout = inputs
inp = np.array(inp)
out = np.array(out)
dout = np.array(dout)
sys.stderr.write('calling into backward: %s, %s, %s\n' %
(inp, out, dout))
ret = np.array(dout) * (1 - np.square(np.array(out)))
tensor = core.LoDTensor()
tensor.set(ret, core.CPUPlace())
return tuple([tensor])
class MLP(fluid.imperative.Layer):
def __init__(self):
super(MLP, self).__init__()
self._fc1 = FC(3,
......@@ -56,9 +85,77 @@ class TestImperative(unittest.TestCase):
with fluid.imperative.guard():
cl = core.Layer()
cl.forward([])
l = fluid.imperative.PyLayer()
l = fluid.imperative.Layer()
self.assertRaises(NotImplementedError, l.forward, [])
def test_pylayer_func_id(self):
with fluid.imperative.guard():
class PyLayer1(fluid.imperative.PyLayer):
def __init__(self):
super(PyLayer1, self).__init__()
@staticmethod
def forward(inputs):
return inputs
@staticmethod
def backward(inputs):
return inputs
class PyLayer2(fluid.imperative.PyLayer):
def __init__(self):
super(PyLayer2, self).__init__()
@staticmethod
def forward(inputs):
return inputs
@staticmethod
def backward(inputs):
return inputs
py_layer_1 = PyLayer1()
py_layer_2 = PyLayer2()
py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))])
py_layer_2([fluid.imperative.base.to_variable(np.ones([2, 2]))])
id = py_layer_1.forward_id
self.assertGreater(id, 0)
self.assertEqual(py_layer_1.backward_id, id + 1)
self.assertEqual(py_layer_2.forward_id, id + 2)
self.assertEqual(py_layer_2.backward_id, id + 3)
py_layer_1([fluid.imperative.base.to_variable(np.ones([2, 2]))])
self.assertEqual(py_layer_1.forward_id, id)
def test_pylayer(self):
np_inp = np.ones([2, 2], np.float32)
with fluid.imperative.guard():
my_py_layer = MyPyLayer()
var_inp = fluid.imperative.base.to_variable(np_inp)
outs = my_py_layer([var_inp])
dy_out = np.sum(outs[0]._numpy())
outs[0]._backward()
dy_grad = var_inp._gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
# TODO(panyx0718): Paddle doesn't diff against data `inp`.
x1 = inp * 1
# TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
param_grads = fluid.backward.append_backward(
x, parameter_list=[x1.name])[0]
exe = fluid.Executor(fluid.CPUPlace())
static_out, static_grad = exe.run(
feed={inp.name: np_inp},
fetch_list=[x.name, param_grads[1].name])
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
def test_layer_in_out(self):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard():
......
......@@ -26,7 +26,7 @@ from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.imperative.PyLayer):
class SimpleImgConvPool(fluid.imperative.Layer):
def __init__(self,
num_channels,
num_filters,
......@@ -72,7 +72,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer):
return x
class MNIST(fluid.imperative.PyLayer):
class MNIST(fluid.imperative.Layer):
def __init__(self, param_attr=None, bias_attr=None):
super(MNIST, self).__init__()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from math import log
from math import exp
from op_test import OpTest
from scipy.special import logit
from scipy.special import expit
import unittest
class TestTeacherStudentSigmoidLossOp(OpTest):
"""
Test teacher_student_sigmoid_loss with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "teacher_student_sigmoid_loss"
batch_size = 16
num_classes = 1
self.inputs = {
'X': logit(
np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32")),
'Label': np.random.uniform(0, 2, (batch_size, num_classes))
.astype("float32")
}
outs = []
for index, label in enumerate(self.inputs["Label"]):
x = self.inputs["X"][index]
if label < -1.0:
outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))))
elif label < 0.0:
outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))))
elif label < 1.0:
outs.append(max(x, 0.0) + log(1.0 + exp(-abs(x))) + \
max(x, 0.0) - x * label + log(1.0 + exp(-abs(x))))
else:
outs.append(max(x, 0.0) - x + log(1.0 + exp(-abs(x))) + \
max(x, 0.0) - x * (label - 1.0) + log(1.0 + exp(-abs(x))))
self.outputs = {'Y': np.array(outs)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.005)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册