diff --git a/.gitignore b/.gitignore
index ac56a3320ec85769d2c87c072512f5217eca0c24..fe0d13f4d9eab2c2a8e7001c9ecb69cce1333af1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
+paddle/operators/check_t.save
+paddle/operators/check_tensor.ls
+paddle/operators/tensor.save
+python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
+python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
+python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
 build/
 build_doc/
@@ -27,5 +33,5 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
-paddle/pybind/pybind.h
+paddle/fluid/pybind/pybind.h
 python/paddle/version.py
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3..3c36cffcb4eeaaf7f8cff5167777628dd2697e7d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,8 @@
 # Contribute Code
 
+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
 We sincerely appreciate your contribution. This document explains our workflow and work style.
 
 ## Workflow
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 6bea7cf3022242ce48cc882915f7e71810937283..de94bd5008effef1bf0fd3a125d4aed56e1b7f81 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
   list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
 elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
+  # nvcc 9 does not support -Os. Use Release flags instead
+  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
 
 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index e24613b94b422b7cdf9c6383c359fa92a4faf6ff..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -323,6 +323,12 @@ batch_norm
 .. autofunction:: paddle.v2.fluid.layers.batch_norm
     :noindex:
 
+layer_norm
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.layer_norm
+    :noindex:
+
 beam_search_decode
 ------------------
 
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 63a78428583477792e309a3b3d26af340caccfca..0f645db6fc5d0f84bbe0cbb335677752e3a355ea 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,5 +8,4 @@ PaddlePaddle 文档
   build_and_install/index_cn.rst
   howto/index_cn.rst
   dev/index_cn.rst
-  api/index_cn.rst
   faq/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 5631381be087017c26b2a6a3984b3c5bdb49f12d..166f56c28f464563a0b36007f58cebb58c286916 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -8,4 +8,3 @@ PaddlePaddle Documentation
   build_and_install/index_en.rst
   howto/index_en.rst
   dev/index_en.rst
-  api/index_en.rst
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 3f9c132ef6ae03c7614e10484715676c8019821e..c7deba2ab475d3c4f2c95327af77af7031b591fd 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -19,12 +19,7 @@ else()
 endif()
 
 if(NOT ANDROID AND NOT IOS)
-  add_subdirectory(memory)
-  add_subdirectory(platform)
-  add_subdirectory(framework)
-  add_subdirectory(operators)
-  add_subdirectory(pybind)
-  add_subdirectory(inference)
+  add_subdirectory(fluid)
 endif()
 
 if(WITH_SWIG_PY)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a6b4191518c45d0579f800ecb901dcd9667e17d5
--- /dev/null
+++ b/paddle/fluid/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(memory)
+add_subdirectory(platform)
+add_subdirectory(framework)
+add_subdirectory(operators)
+add_subdirectory(pybind)
+add_subdirectory(inference)
diff --git a/paddle/framework/.clang-format b/paddle/fluid/framework/.clang-format
similarity index 100%
rename from paddle/framework/.clang-format
rename to paddle/fluid/framework/.clang-format
diff --git a/paddle/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
similarity index 100%
rename from paddle/framework/CMakeLists.txt
rename to paddle/fluid/framework/CMakeLists.txt
diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d7e7366b0723c630b24d62c1f5d0a72cf42d770
--- /dev/null
+++ b/paddle/fluid/framework/attribute.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/attribute.h"
+
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
+  switch (attr_desc.type()) {
+    case proto::AttrType::BOOLEAN: {
+      return attr_desc.b();
+    }
+    case proto::AttrType::INT: {
+      return attr_desc.i();
+    }
+    case proto::AttrType::FLOAT: {
+      return attr_desc.f();
+    }
+    case proto::AttrType::STRING: {
+      return attr_desc.s();
+    }
+    case proto::AttrType::BOOLEANS: {
+      std::vector<bool> val(attr_desc.bools_size());
+      for (int i = 0; i < attr_desc.bools_size(); ++i) {
+        val[i] = attr_desc.bools(i);
+      }
+      return val;
+    }
+    case proto::AttrType::INTS: {
+      std::vector<int> val(attr_desc.ints_size());
+      for (int i = 0; i < attr_desc.ints_size(); ++i) {
+        val[i] = attr_desc.ints(i);
+      }
+      return val;
+    }
+    case proto::AttrType::FLOATS: {
+      std::vector<float> val(attr_desc.floats_size());
+      for (int i = 0; i < attr_desc.floats_size(); ++i) {
+        val[i] = attr_desc.floats(i);
+      }
+      return val;
+    }
+    case proto::AttrType::STRINGS: {
+      std::vector<std::string> val(attr_desc.strings_size());
+      for (int i = 0; i < attr_desc.strings_size(); ++i) {
+        val[i] = attr_desc.strings(i);
+      }
+      return val;
+    }
+    case proto::AttrType::LONG: {
+      return attr_desc.l();
+    }
+    default:
+      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
+  }
+  return boost::blank();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
new file mode 100644
index 0000000000000000000000000000000000000000..16be42ae71497bcc755d10eee2d73d331ede7da6
--- /dev/null
+++ b/paddle/fluid/framework/attribute.h
@@ -0,0 +1,284 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+template <typename T>
+inline proto::AttrType AttrTypeID() {
+  Attribute tmp = T();
+  return static_cast<proto::AttrType>(tmp.which() - 1);
+}
+
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);
+
+class AttrReader {
+ public:
+  explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
+
+  template <typename T>
+  inline const T& Get(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+ private:
+  const AttributeMap& attrs_;
+};
+
+// check whether a value(attribute) fit a certain limit
+template <typename T>
+class GreaterThanChecker {
+ public:
+  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+template <typename T>
+class EqualGreaterThanChecker {
+ public:
+  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+// we can provide users more common Checker, like 'LessThanChecker',
+// 'BetweenChecker'...
+
+template <typename T>
+class DefaultValueSetter {
+ public:
+  explicit DefaultValueSetter(T default_value)
+      : default_value_(default_value) {}
+  void operator()(T& value) const { value = default_value_; }
+
+ private:
+  T default_value_;
+};
+
+template <typename T>
+class EnumInContainer {
+ public:
+  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
+  void operator()(T& val) const {
+    PADDLE_ENFORCE(container_.find(val) != container_.end(),
+                   "Value %s is not in enum container %s", val,
+                   ContainerDebugString());
+  }
+
+ private:
+  std::string ContainerDebugString() const {
+    std::ostringstream sout;
+    sout << "[";
+    size_t cnt = 0;
+    for (auto& v : container_) {
+      sout << v;
+      ++cnt;
+      if (cnt != container_.size()) {
+        sout << " ,";
+      }
+    }
+    sout << "]";
+    return sout.str();
+  }
+
+  std::unordered_set<T> container_;
+};
+
+template <typename T>
+struct ExtractAttribute {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  T* operator()(Attribute& attr) const {
+    T* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<T>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
+                   attr_name_, typeid(T).name(), attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+// special handle bool
+// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
+// hard to change the logic there. In another way, we should correct handle
+// if the user set `some_flag=1`.
+//
+// FIX ME anytime if there is a better solution.
+template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + bool* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = boost::get(attr); + attr = static_cast(val); + } + bool* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// check whether a certain attribute fit its limits +// an attribute can have more than one limits +template +class TypedAttrChecker { + typedef std::function ValueChecker; + + public: + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} + + TypedAttrChecker& InEnum(const std::unordered_set& range) { + value_checkers_.push_back(EnumInContainer(range)); + return *this; + } + + TypedAttrChecker& GreaterThan(const T& lower_bound) { + value_checkers_.push_back(GreaterThanChecker(lower_bound)); + return *this; + } + + TypedAttrChecker& EqualGreaterThan(const T& lower_bound) { + value_checkers_.push_back(EqualGreaterThanChecker(lower_bound)); + return *this; + } + + // we can add more common limits, like LessThan(), Between()... 
+ + TypedAttrChecker& SetDefault(const T& default_value) { + PADDLE_ENFORCE(default_value_setter_.empty(), + "%s can't have more than one default value!", attr_name_); + default_value_setter_.push_back(DefaultValueSetter(default_value)); + return *this; + } + + // allow users provide their own checker + TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) { + value_checkers_.push_back(checker); + return *this; + } + + void operator()(AttributeMap& attr_map) const { + if (!attr_map.count(attr_name_)) { + // user do not set this attr + PADDLE_ENFORCE(!default_value_setter_.empty(), + "Attribute '%s' is required!", attr_name_); + // default_value_setter_ has no more than one element + T val; + (default_value_setter_[0])(val); + attr_map[attr_name_] = val; + } + Attribute& attr = attr_map.at(attr_name_); + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(attr); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } + } + + private: + std::string attr_name_; + std::vector value_checkers_; + std::vector default_value_setter_; +}; + +// check whether op's all attributes fit their own limits +class OpAttrChecker { + typedef std::function AttrChecker; + + public: + template + TypedAttrChecker& AddAttrChecker(const std::string& attr_name) { + attr_checkers_.push_back(TypedAttrChecker(attr_name)); + AttrChecker& checker = attr_checkers_.back(); + return *(checker.target>()); + } + + void Check(AttributeMap& attr_map) const { + for (const auto& checker : attr_checkers_) { + checker(attr_map); + } + } + + private: + std::vector attr_checkers_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4795f4fc5c73034b23305162ea3b710480d8ebc --- /dev/null +++ b/paddle/fluid/framework/backward.cc @@ -0,0 +1,585 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/operators/net_op.h" + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace framework { + +static std::unordered_set* g_ctrl_flow_ops_ = nullptr; +// Control Flow operators's backward is significantly different from +// computational operators. Hack Code here. +// We should design a better way to backward CtrlFlowOps. 
+static std::unordered_set& CtrlFlowOps() { + if (g_ctrl_flow_ops_ == nullptr) { + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; + } + return *g_ctrl_flow_ops_; +} + +static inline std::unique_ptr CreateGradOp( + const OperatorBase& op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var) { + OpDesc op_desc; + op_desc.SetInputMap(op.Inputs()); + op_desc.SetOutputMap(op.Outputs()); + op_desc.SetType(op.Type()); + op_desc.SetAttrMap(op.Attrs()); + auto& info = OpInfoMap::Instance().Get(op.Type()); + auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {}); + std::vector> grad_ops; + grad_ops.reserve(grad_descs.size()); + std::transform(grad_descs.begin(), grad_descs.end(), + std::back_inserter(grad_ops), + [](const std::unique_ptr& grad_desc) { + return OpRegistry::CreateOp(*grad_desc); + }); + PADDLE_ENFORCE(!grad_ops.empty()); + if (grad_ops.size() == 1) { + return std::move(grad_ops[0]); + } else { + auto net_op = new operators::NetOp(); + for (auto& grad_op : grad_ops) { + net_op->AppendOp(std::move(grad_op)); + } + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); + } +} + +template +static void ForEachVarName(const Map& names, T callback) { + for (auto& name : names) { + for (auto& n : name.second) { + if (callback(n)) return; + } + } +} + +// return whether all the names + suffixes in the set +static bool AllInSet( + const std::map>& names, + const std::string& suffix, const std::unordered_set& set) { + bool all_in_set = true; + ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) { + all_in_set = set.find(n + suffix) != set.end(); + return !all_in_set; + }); + return all_in_set; +} + +static std::unique_ptr NOP() { + auto net_op = new operators::NetOp(); + net_op->SetType("@NOP@"); + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); +} + +// Get backward operator from a forward operator, a recursive implementation. +// +// no_grad_names the gradient variable names without gradient calculating. +// +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. +// +// returns The backward operator. In a simple situation, it may be a simple +// operator, in a complex situation, it maybe a NetOp. +// +// See Backward.h for details +static std::unique_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, + std::unordered_map* grad_to_var, + size_t& uniq_id) { + // If all input gradients of forwarding operator do not need to calculate, + // just return an NOP. Not return null ptr because NOP does not take + // too much time for calculation, but it is useful for simplifying logic. + if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + return NOP(); + } + + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into + // `no_grad_names` set. Return an NOP. 
+ if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + ForEachVarName(forwardOp.Inputs(), + [&no_grad_names](const std::string& name) -> bool { + no_grad_names.insert(GradVarName(name)); + return false; + }); + return NOP(); + } + + // Returned gradient network + auto net = std::unique_ptr(new operators::NetOp()); + + if (forwardOp.IsNetOp()) { + // Because forwardOp is a net op, it can static_cast. + auto& forwardNet = static_cast(forwardOp); + + // Map from output gradient variable name to operator's indices in + // backward net's ops_. That operator generates that variable. + std::unordered_map> dup_output_ops; + + size_t local_op_id = 0; + // reversely travel forwardNet and collect all duplicate outputs. + for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); + ++it, ++local_op_id) { + auto& fwd = *it; + auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id); + ForEachVarName(bwd->Outputs(), + [&dup_output_ops, local_op_id](const std::string& out) { + dup_output_ops[out].emplace_back(local_op_id); + return false; + }); + net->AppendOp(std::move(bwd)); + } + // Get unique ID for this method. + auto uid = uniq_id++; + // TODO(dzh): more comment + // multiple operators which have the same output (y for example) may + // overwrite the same y variable when backward, special operations are token + // to handle this case. For each duplicate output, rename it to an alias + // (original name with a offset), append an `add` op for its operator, + // and finally sum all the alias variable to the final output variable y. + using Pos = std::pair>; + std::list insert_position; + for (auto& dup_output_op : dup_output_ops) { + const std::string& name = dup_output_op.first; + // duplicate @Empty@ don't need to be added + if (name == kEmptyVarName) continue; + + auto& dup_op = dup_output_op.second; + // no duplicate output + if (dup_op.size() == 1) continue; + + // process the duplicate outputs + std::vector dup_outputs; + for (size_t i = 0; i < dup_op.size(); ++i) { + // rename each duplicate output to an alias + auto op_offset = dup_op[i]; + dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + + std::to_string(i)); + net->ops_[op_offset]->Rename(name, dup_outputs.back()); + } + // collect all the offset for each alias, + // insert a sum operator to add all aliases to output + insert_position.push_back( + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); + } + + // make sure the inserted `sum` ops follow the BFS order. + insert_position.sort( + [](const Pos& l, const Pos& r) { return l.first > r.first; }); + + for (auto& pos : insert_position) { + net->InsertOp(pos.first + 1, std::move(pos.second)); + } + } else { + std::unique_ptr grad_op( + CreateGradOp(forwardOp, no_grad_names, grad_to_var)); + + ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( + const std::string& grad_input) { + if (no_grad_names.count(grad_input)) { + // +1 for \0 + std::string prefix = grad_input.substr( + 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + grad_op->Rename(grad_input, prefix + kZeroVarSuffix); + + // If part of input gradient of that operator is not calculated, fill + // zero variables to that input gradient. 
+ net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {grad_input}}}, + AttributeMap{})); + } + return false; + }); + + ForEachVarName(grad_op->Outputs(), + [&no_grad_names, &grad_op](const std::string& grad_output) { + if (no_grad_names.count(grad_output)) { + grad_op->Rename(grad_output, kEmptyVarName); + } + return false; + }); + + if (net->ops_.empty()) { // Current no aux op is added to network + return grad_op; + } + net->AppendOp(std::move(grad_op)); + } + net->SetType("@GENERATED_BACKWARD@"); + net->CompleteAddOp(); + return std::unique_ptr( + static_cast(net.release())); +} + +// See header for comments +std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_names; + no_grad_names.reserve(no_grad_vars.size() + 1); + + no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + + for (auto& name : no_grad_vars) { + no_grad_names.insert(name + kGradVarSuffix); + } + size_t uid = 0; + std::unordered_map grad_to_var; + return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid); +} + +// ==================================== // + +static bool AllGradInSet(const std::vector& names, + const std::unordered_set& set) { + for (const std::string& name : names) { + if (!set.count(GradVarName(name))) { + return false; + } + } + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "All input {"; + for (auto& name : names) { + sout << name << ","; + } + sout << "} is in {"; + for (auto& name : set) { + sout << name << ","; + } + sout << "}"; + VLOG(10) << sout.str(); + } + return true; +} + +static std::string FwdName(const std::string& grad_name) { + auto pos = grad_name.find("@GRAD"); + if (pos == std::string::npos) { + return ""; + } else { + return grad_name.substr(0, pos); + } +} + +static void CreateGradVarInBlock( + size_t grad_op_start_index, + const std::unordered_map& param_name_map, + BlockDesc* block_desc, + std::unordered_map* grad_var_record) { + auto ops = block_desc->AllOps(); + for (size_t op_index = grad_op_start_index; op_index < ops.size(); + ++op_index) { + std::unordered_set new_vars; + auto& ctrl_flow_ops = CtrlFlowOps(); + ForEachVarName(ops[op_index]->Outputs(), + [&](const std::string& grad_var_name) { + if (ctrl_flow_ops.find(ops[op_index]->Type()) != + ctrl_flow_ops.end()) { + if (block_desc->HasVarRecursive(grad_var_name)) { + return false; + } + } else { + if (block_desc->HasVar(grad_var_name)) { + return false; + } + } + if (grad_var_name == framework::kEmptyVarName) { + return false; + } + auto var = block_desc->Var(grad_var_name); + VLOG(10) << "Creating Variable " << grad_var_name; + new_vars.insert(var->Name()); + auto it = param_name_map.find(grad_var_name); + if (it == param_name_map.end()) { + return false; + } + auto param_var_name = it->second; + auto& grad_record = (*grad_var_record)[param_var_name]; + grad_record.name_ = grad_var_name; + grad_record.block_idx_ = block_desc->ID(); + grad_record.op_idx_ = static_cast(op_index); + return false; /* not break */ + }); + ops[op_index]->InferVarType(block_desc); + for (auto& arg : ops[op_index]->OutputArgumentNames()) { + if (new_vars.find(arg) == new_vars.end()) { + continue; + } + auto pname = FwdName(arg); + auto* param = block_desc->FindVarRecursive(pname); + auto* grad = block_desc->FindVar(arg); + if (param == nullptr) { + grad->SetDataType(proto::DataType::FP32); + } else { + grad->SetDataType(param->GetDataType()); + } + } + ops[op_index]->InferShape(*block_desc); + } +} 
+ +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; + // All input gradients of forwarding operator do not need to calculate. + const std::vector& inputs = op_desc->InputArgumentNames(); + if (AllGradInSet(inputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + return grad_op_descs; // empty vector + } + + // All output gradients of forwarding operator do not need to calculate. + const std::vector& outputs = op_desc->OutputArgumentNames(); + + if (AllGradInSet(outputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); + // FIXME: Hack code here + auto& ctrl_flow_ops = CtrlFlowOps(); + if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { + // Only computational op need drop input's gradient. + for (const std::string& name : inputs) { + no_grad_vars->insert(GradVarName(name)); + VLOG(10) << " Also drop " << GradVarName(name); + } + } + + return grad_op_descs; // empty vector + } + + grad_op_descs = + OpInfoMap::Instance() + .Get(op_desc->Type()) + .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); + + std::list> pending_fill_zeros_ops; + for (auto& desc : grad_op_descs) { + for (const std::string& in_name : desc->InputArgumentNames()) { + if (no_grad_vars->count(in_name)) { + std::string prefix = in_name.substr( + 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + std::string new_name = prefix + kZeroVarSuffix; + desc->Rename(in_name, new_name); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); + pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); + } + } + } + + for (auto& p : pending_fill_zeros_ops) { + grad_op_descs.insert(grad_op_descs.begin(), std::move(p)); + } + return grad_op_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx); + +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, + std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var) { + VLOG(5) << "MakeBlockBackward"; + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); + std::unordered_map> dup_out_ops; + size_t grad_desc_idx = 0; + std::vector> backward_descs; + + for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { + VLOG(5) << "Making backward " << (*it)->Type() << " op"; + std::vector> op_grads; + + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { + int step_block_idx = (*it)->GetBlockAttr("sub_block"); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, step_block_idx); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else if ((*it)->Type() == "conditional_block") { + BlockDesc* backward_block = + CreateStepBlock(program_desc, no_grad_vars, grad_to_var, + (*it)->GetBlockAttr("sub_block")); + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); + } else { + op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); + } + + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + sout << "Made "; + for (auto& op_grad : op_grads) { + sout << op_grad->Type() << " "; + } + VLOG(10) << sout.str(); + } + + for (const auto& desc : op_grads) { + for (const std::string& 
out_name : desc->OutputArgumentNames()) { + if (out_name.find("@GRAD") == std::string::npos) { + // Not all outputs of a backward operator is a gradient. Only gradient + // need to be sum. Skip variables are not gradient. + continue; + } + dup_out_ops[out_name].emplace_back(grad_desc_idx); + } + ++grad_desc_idx; + } + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); + } + + VLOG(5) << "Appending Sums"; + // Check whether some variables are written more than once + std::list>> pending_sum_ops; + for (const auto& dup : dup_out_ops) { + const std::string& out_name = dup.first; + const std::vector dup_op = dup.second; + if (out_name != kEmptyVarName && dup_op.size() > 1) { + std::vector sum_op_inputs; + std::string next_g_name = out_name; + for (size_t i = 0; i < dup_op.size(); ++i) { + VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name + << " duplicated"; + std::string new_name = out_name + "@RENAME@" + std::to_string(i); + backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); + backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); + sum_op_inputs.emplace_back(new_name); + next_g_name = sum_op_inputs.back(); + } + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", {out_name}}}, + AttributeMap{})); + pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); + } + } + + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); + for (auto& p : pending_sum_ops) { + backward_descs.insert(backward_descs.begin() + p.first + 1, + std::move(p.second)); + } + + VLOG(5) << "MakeBlockBackward Finished"; + + return backward_descs; +} + +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, + std::unordered_map* grad_to_var, + int step_block_idx) { + auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, + no_grad_vars, grad_to_var); + BlockDesc* backward_block = + program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); + for (auto& ptr : backward_block_op_descs) { + backward_block->AppendAllocatedOp(move(ptr)); + } + return backward_block; +} + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_var_names; + no_grad_var_names.reserve(no_grad_vars.size() + 1); + no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + for (auto& name : no_grad_vars) { + no_grad_var_names.insert(GradVarName(name)); + } + + const int root_block_idx = 0; + auto root_block = program_desc.MutableBlock(root_block_idx); + + std::string fill_one_op_out = GradVarName(target.Name()); + bool is_scalar = target.GetShape() == std::vector{1}; + PADDLE_ENFORCE(is_scalar, "target should be scalar"); + VLOG(3) << "backward from loss=" << target.Name() + << " data_type=" << target.GetDataType(); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); + // infer var type of fill_one_op + fill_one_op->InferVarType(root_block); + + root_block->AppendAllocatedOp(std::move(fill_one_op)); + size_t forward_op_num = root_block->OpSize(); + size_t forward_block_num = program_desc.Size(); + + // Insert backward operators + std::unordered_map grad_to_var; + auto backward_op_descs = MakeBlockBackward(program_desc, 
root_block_idx, + &no_grad_var_names, &grad_to_var); + + for (auto& ptr : backward_op_descs) { + root_block->AppendAllocatedOp(std::move(ptr)); + } + // Create Variable + + // Create target gradient variable + std::unordered_map retv; + + auto var = root_block->Var(fill_one_op_out); + var->SetDataType(target.GetDataType()); + var->SetShape(target.GetShape()); + auto& target_grad = retv[target.Name()]; + target_grad.name_ = fill_one_op_out; + target_grad.block_idx_ = root_block_idx; + target_grad.op_idx_ = static_cast(forward_op_num); + + // create grad_var for all blocks in this program + CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv); + for (size_t block_index = forward_block_num; + block_index < program_desc.Size(); ++block_index) { + CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index), + &retv); + } + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea6922426e1dad0ca9b6e1287701bca0adef5c8 --- /dev/null +++ b/paddle/fluid/framework/backward.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +// Create the backward operator from a forward operator. +// TODO(yuyang18): Add more API reference comment. +extern std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); + +struct GradVarInfo { + GradVarInfo() {} + GradVarInfo(const std::string& name, int block_idx, int op_idx) + : name_(name), block_idx_(block_idx), op_idx_(op_idx) {} + + bool operator==(const GradVarInfo& b) const { + return name_ == b.name_ && block_idx_ == b.block_idx_ && + op_idx_ == b.op_idx_; + } + + std::string name_; + int block_idx_; + int op_idx_; +}; + +using ParamGradInfoMap = std::unordered_map; + +ParamGradInfoMap AppendBackward( + ProgramDesc& program_desc, const VarDesc& target, + const std::unordered_set& no_grad_vars); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9604c68913f98abc4d52c84bc8fa2c02e1a6a31 --- /dev/null +++ b/paddle/fluid/framework/backward_test.cc @@ -0,0 +1,918 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/backward.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/net_op.h" + +USE_NO_KERNEL_OP(fill_constant); + +namespace paddle { +namespace framework { + +using DeviceContext = platform::DeviceContext; + +class NoneOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override {} +}; + +template +class NoneKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override {} +}; + +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { + public: + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input X of Add"); + AddInput("b", "Bias of Add"); + AddOutput("Out", "Out of Add"); + AddComment("Add Op"); + } +}; + +class RowWiseAddGradMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); + grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(GradVarName("X"), InputGrad("X")); + grad_op->SetOutput(GradVarName("b"), InputGrad("b")); + grad_op->SetType("rowwise_add_grad"); + return std::unique_ptr(grad_op); + } +}; + +class MulOpMaker : public OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "A"); + AddInput("Y", "B"); + AddOutput("Out", "Out"); + AddAttr("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + AddAttr("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); + AddComment("Mul"); + } +}; + +class SigmoidOpMaker : public OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X"); + AddOutput("Out", "Y"); + AddComment("Sigmoid"); + } +}; + +class NoGradOpMaker : public OpProtoAndCheckerMaker { + public: + NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X input"); + AddOutput("Out", "Y output"); + AddComment("NoGradOp, same input output. 
no Grad"); + } +}; + +class FcOp : public operators::NetOp { + public: + FcOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); + auto input_b = Inputs("b"); + std::string before_act = "mul_result"; + if (input_b.size() != 0) { + AppendOp(OpRegistry::CreateOp( + "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, + {{"Out", {Output("add_result")}}}, AttributeMap{})); + before_act = "add_result"; + } else { + auto out_varname = Output("add_result"); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); + } + } + + AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, + {{"Out", {Output("Out")}}}, AttributeMap{})); + CompleteAddOp(false); + } +}; + +class FcOpMaker : public OpProtoAndCheckerMaker { + public: + FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("W", "w"); + AddInput("b", "b"); + AddOutput("mul_result", "").AsIntermediate(); + AddOutput("add_result", "").AsIntermediate(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class ManyOutputOpMaker : public OpProtoAndCheckerMaker { + public: + ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("y", "y"); + AddOutput("z", "z"); + AddComment(""); + } +}; + +class FillZeroOpMaker : public OpProtoAndCheckerMaker { + public: + FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddOutput("Out", "out"); + AddComment(""); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "the input tensors of sum operator.").AsDuplicable(); + AddOutput("Out", "the output tensor of sum operator."); + AddComment(""); + } +}; + +class MultInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("H", "h"); + AddOutput("Y", "y"); + AddOutput("Z", "z"); + AddComment(""); + } +}; + +class MinusGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> retv; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", x_g); + op_desc->SetAttr("scale", 1.0f); + retv.emplace_back(op_desc); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) { + auto *op_desc = new OpDesc(); + op_desc->SetType("scale"); + op_desc->SetInput("X", OutputGrad("Out")); + op_desc->SetOutput("Out", y_g); + op_desc->SetAttr("scale", -1.0f); + retv.emplace_back(op_desc); + } + return retv; + } +}; + +class MinusOpMaker : public OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("Y", ""); + AddOutput("Out", ""); + AddComment("minus for unittest"); + } +}; +} // namespace framework +} // 
namespace paddle + +namespace f = paddle::framework; +namespace ops = paddle::operators; +using EnforceNotMet = paddle::platform::EnforceNotMet; +// rowwise_add +REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker, + f::RowWiseAddGradMaker); +REGISTER_OP_CPU_KERNEL(rowwise_add, + f::NoneKernel); +REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(rowwise_add_grad, + f::NoneKernel); +// mul +REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mul_grad, + f::NoneKernel); +// sigmoid +REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sigmoid, + f::NoneKernel); +REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker); +// fill_zeros_like +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker); +REGISTER_OP_CPU_KERNEL(fill_zeros_like, + f::NoneKernel); +// sum +REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(sum_grad, + f::NoneKernel); +// fc +REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); +// many_output_op +REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker, + many_output_op_grad, f::NoneOp); +// mult_in_out +REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad, + f::NoneOp); +REGISTER_OP_CPU_KERNEL(mult_in_out, + f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mult_in_out_grad, + f::NoneKernel); +// minus +REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker); +REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel); +// scale +REGISTER_OPERATOR(scale, f::NoneOp); +REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); + +TEST(Backward, simple_op_not_need_grad) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + auto gop = f::Backward(*fwd, {"x"}); + ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); + + auto no_input_gop = f::Backward(*fwd, {"x", "b"}); + ASSERT_NE(no_input_gop, nullptr); + ASSERT_TRUE(no_input_gop->IsNetOp()); + ASSERT_EQ(0UL, static_cast(no_input_gop.get())->ops_.size()); +} + +TEST(Backward, net_fc_backward_normal) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_re"}}, + {"Out", {"out"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(3UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_add = *net->ops_[1]; + ASSERT_EQ("rowwise_add_grad", d_add.Type()); + + f::OperatorBase &d_mul = *net->ops_[2]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_fc_backward_not_have_b) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_res"}}, + {"Out", {"tmp"}}}, + f::AttributeMap{}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(2UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = 
*net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_mul = *net->ops_[1]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_input_of_network_not_need_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_tmp_0"}}, + {"add_result", {"add_tmp_0"}}, + {"Out", {"hidden0"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_tmp_1"}}, + {"add_result", {"add_tmp_1"}}, + {"Out", {"hidden1"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + auto bwd = Backward(net, {"x"}); // x@GRAD is not need. + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + + auto output_vars = bwd_net->OutputVars(true); + std::unordered_set all_outputs = + std::unordered_set(output_vars.begin(), output_vars.end()); + all_outputs.erase(f::kEmptyVarName); + + for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { + ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end()); + } + + // Not Generated X + ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end()); + + ASSERT_EQ(2UL, bwd_net->ops_.size()); + ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + ASSERT_EQ(3UL, first_fc_grad->ops_.size()); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output(f::GradVarName("X"))); +} + +TEST(Backward, net_shared_weight) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, + {{"Out", {"out"}}}, f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto bwd = f::Backward(net, std::unordered_set{}); + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + ASSERT_EQ(3UL, bwd_net->ops_.size()); + ASSERT_EQ("sum", bwd_net->ops_[2]->Type()); +} + +TEST(Backward, op_all_input_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"x", "b"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_all_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"out"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_part_of_output_are_not_need) { + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"Z"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_EQ(net->ops_.size(), 2UL); + + auto &fill_zero = *net->ops_[0]; + ASSERT_EQ("fill_zeros_like", fill_zero.Type()); + ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); + ASSERT_EQ("Z", fill_zero.Input("X")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); + + auto &d_many_out = *net->ops_[1]; + ASSERT_EQ("many_output_op_grad", d_many_out.Type()); + ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG + ASSERT_EQ(std::string("Z") + 
f::kZeroVarSuffix, + d_many_out.Input(f::GradVarName("z"))); + ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); + ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); +} + +TEST(Backward, op_part_of_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); + auto backward = f::Backward(*fwd, {"a"}); + auto &grad_mul = *backward; + ASSERT_EQ(grad_mul.Type(), "mul_grad"); + ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL); + ASSERT_EQ(grad_mul.Outputs().size(), 2UL); + ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b")); + ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); + ASSERT_EQ(grad_mul.Input("X"), "a"); + ASSERT_EQ(grad_mul.Input("Y"), "b"); + ASSERT_EQ(grad_mul.Input("Out"), "out"); +} + +TEST(Backward, linear_net_intermediate_variable_has_no_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_out1"}}, + {"add_result", {"add_out1"}}, + {"Out", {"out1"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_out2"}}, + {"add_result", {"tmp_out2"}}, + {"Out", {"out2"}}}, + f::AttributeMap{})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, + {{"mul_result", {"mul_out3"}}, + {"add_result", {"tmp_out3"}}, + {"Out", {"out3"}}}, + f::AttributeMap{})); + net.CompleteAddOp(); + + auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); + ASSERT_TRUE(backward->IsNetOp()); + auto bwd_net = static_cast(backward.get()); + ASSERT_EQ(bwd_net->ops_.size(), 3UL); + auto &grad_fc = *bwd_net->ops_[0]; + + const char *all = paddle::operators::NetOp::kAll; + EXPECT_EQ(grad_fc.Inputs(all).size(), + 2UL /* external input number */ + + 1UL /* external output number*/ + + 1UL /* number of gradient of external output*/ + + 2UL /* internal variable number*/ + ); + EXPECT_EQ(grad_fc.Outputs(all).size(), + 2UL /* input number of mul*/ + + 2UL /* input number of rowwise_add*/ + + 1UL /* input number of sigmod */ + - 1UL /* out2 is not needed*/); + EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL); +} + +TEST(Backward, simple_single_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + f::OpDesc *op = block->AppendOp(); + op->SetType("rowwise_add"); + op->SetInput("X", {"x"}); + op->SetInput("b", {"b"}); + op->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op->InputNames().size(), 1UL); + ASSERT_EQ(grad_op->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x")})); + EXPECT_EQ(grad_op->Output(f::GradVarName("b")), + 
std::vector({f::GradVarName("b")})); + + EXPECT_EQ(var_to_grad.size(), 3UL); + EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2)); + EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x"))); +} + +TEST(Backward, default_attribute) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {"x"}); + op->SetInput("Y", {"y"}); + op->SetOutput("Out", {"out"}); + op->CheckAttrs(); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 3UL); + EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); + EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); + + f::OpDesc *fill_op = block->AllOps()[1]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op = block->AllOps()[2]; + ASSERT_EQ(grad_op->Type(), "mul_grad"); + EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); + EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); +} + +TEST(Backward, simple_mult_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 6UL + 1); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op2 = block->AllOps()[5]; + EXPECT_EQ(grad_op2->Type(), "mul_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 4UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + 
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + EXPECT_EQ(var_to_grad.size(), 7UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out2"), + f::GradVarInfo(f::GradVarName("out2"), 0, 4)); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); +} + +TEST(Backward, intermedia_var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"x2"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out2"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + f::OpDesc *op4 = block->AppendOp(); + op4->SetType("mul"); + op4->SetInput("X", {"out1"}); + op4->SetInput("Y", {"out3"}); + op4->SetOutput("Out", {"out4"}); + + auto target = f::VarDesc("out4"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"out3"}); + + ASSERT_EQ(block->AllOps().size(), 7UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op1 = block->AllOps()[6]; + EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + EXPECT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"out3"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out4"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out4")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + 
EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); +} + +TEST(Backward, var_no_grad) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("mult_in_out"); + op1->SetInput("X", {"x1"}); + op1->SetInput("H", {"h1"}); + op1->SetOutput("Y", {"y1"}); + op1->SetOutput("Z", {"z1"}); + + f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mult_in_out"); + op2->SetInput("X", {"y1"}); + op2->SetInput("H", {"z1"}); + op2->SetOutput("Y", {"y2"}); + op2->SetOutput("Z", {"z2"}); + + auto target = f::VarDesc("z2"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"z1"}); + + ASSERT_EQ(block->AllOps().size(), 6UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op2 = block->AllOps()[3]; + ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op2->InputNames().size(), 6UL); + ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op2->Input("X"), std::vector({"y1"})); + EXPECT_EQ(grad_op2->Input("H"), std::vector({"z1"})); + EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op2->Input("Z"), std::vector({"z2"})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")), + std::vector({f::GradVarName("z2")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); + + f::OpDesc *fill_zero_op = block->AllOps()[4]; + ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); + ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); + ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); + EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); + EXPECT_EQ(fill_zero_op->Output("Out"), + std::vector({std::string("z1") + f::kZeroVarSuffix})); + + f::OpDesc *grad_op1 = block->AllOps()[5]; + ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 6UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input("X"), std::vector({"x1"})); + EXPECT_EQ(grad_op1->Input("H"), std::vector({"h1"})); + EXPECT_EQ(grad_op1->Input("Y"), std::vector({"y1"})); + EXPECT_EQ(grad_op1->Input("Z"), std::vector({"z1"})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")), + std::vector({f::GradVarName("y1")})); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")), + std::vector({std::string("z1") + f::kZeroVarSuffix})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("H")), + std::vector({f::GradVarName("h1")})); + + EXPECT_EQ(var_to_grad.size(), 4UL); + EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5)); + EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("y1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("h1"))); +} + +TEST(Backward, shared_var) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); + op1->SetType("rowwise_add"); + op1->SetInput("X", {"x1"}); + op1->SetInput("b", {"b1"}); + op1->SetOutput("Out", {"out1"}); + + 
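// "out1" feeds both of the ops appended below, so the backward pass is
// expected to emit its gradient twice under @RENAME@ suffixes and merge
// them with a "sum" op; the assertions later in this test check exactly that.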
f::OpDesc *op2 = block->AppendOp(); + op2->SetType("mul"); + op2->SetInput("X", {"out1"}); + op2->SetInput("Y", {"y2"}); + op2->SetOutput("Out", {"out2"}); + + f::OpDesc *op3 = block->AppendOp(); + op3->SetType("rowwise_add"); + op3->SetInput("X", {"out1"}); + op3->SetInput("b", {"b3"}); + op3->SetOutput("Out", {"out3"}); + + auto target = f::VarDesc("out3"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); + + ASSERT_EQ(block->AllOps().size(), 8UL); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + + f::OpDesc *grad_op3 = block->AllOps()[4]; + ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op3->InputNames().size(), 1UL); + ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out3")})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@0"})); + EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b3")})); + + f::OpDesc *grad_op4 = block->AllOps()[5]; + ASSERT_EQ(grad_op4->Type(), "mul_grad"); + ASSERT_EQ(grad_op4->InputNames().size(), 4UL); + ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); + EXPECT_EQ(grad_op4->Input("Y"), std::vector({"y2"})); + EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out2"})); + EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out2")})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), + std::vector({f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), + std::vector({f::GradVarName("y2")})); + + f::OpDesc *sum_op = block->AllOps()[6]; + ASSERT_EQ(sum_op->Type(), "sum"); + ASSERT_EQ(sum_op->InputNames().size(), 1UL); + ASSERT_EQ(sum_op->OutputNames().size(), 1UL); + EXPECT_EQ(sum_op->Input("X"), + std::vector({f::GradVarName("out1") + "@RENAME@0", + f::GradVarName("out1") + "@RENAME@1"})); + EXPECT_EQ(sum_op->Output("Out"), + std::vector({f::GradVarName("out1")})); + + f::OpDesc *grad_op1 = block->AllOps()[7]; + ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); + ASSERT_EQ(grad_op1->InputNames().size(), 1UL); + ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); + EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), + std::vector({f::GradVarName("out1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), + std::vector({f::GradVarName("x1")})); + EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), + std::vector({f::GradVarName("b1")})); + + EXPECT_EQ(var_to_grad.size(), 6UL); + EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); + EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); + EXPECT_EQ(var_to_grad.at("out1"), + f::GradVarInfo(f::GradVarName("out1"), 0, 6)); + EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7)); + EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7)); + + EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); + EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); +} + +TEST(Backward, half_backward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + auto *op1 = block->AppendOp(); + op1->SetType("minus"); + op1->SetInput("X", {"a"}); 
+ op1->SetInput("Y", {"b"}); + op1->SetOutput("Out", {"out"}); + + auto target = f::VarDesc("out"); + target.SetShape({1}); + size_t forward_len = block->AllOps().size(); + auto var_to_grad = AppendBackward(program, target, {"b"}); + f::OpDesc *fill_op = block->AllOps()[forward_len]; + EXPECT_EQ(fill_op->Type(), "fill_constant"); + auto ops = block->AllOps(); + ASSERT_EQ(3UL, ops.size()); + + EXPECT_EQ(var_to_grad.size(), 2UL); + EXPECT_EQ(var_to_grad.at("a"), + f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1)); +} diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..dccdbf15fffda1d1702595fb6b5766561cf2a333 --- /dev/null +++ b/paddle/fluid/framework/block_desc.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +VarDesc *BlockDesc::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + need_update_ = true; + auto *var = new VarDesc(name); + vars_[name].reset(var); + return var; +} + +VarDesc *BlockDesc::FindVar(const std::string &name) const { + auto it = vars_.find(name); + if (it == vars_.end()) { + return nullptr; + } + return it->second.get(); +} + +bool BlockDesc::HasVar(const std::string &name) const { + return vars_.find(name) != vars_.end(); +} + +VarDesc *BlockDesc::RenameVar(const std::string &old_name, + const std::string &new_name) { + if (!this->HasVar(old_name)) { + return nullptr; + } + need_update_ = true; + auto *var = this->Var(old_name); + VarDesc *new_var = new VarDesc(*(var->Proto())); + new_var->SetName(new_name); + // new_var->SetShape(var->GetShape()); + // new_var->SetType(var->GetType()); + // new_var->SetDataType(var->GetDataType()); + // new_var->SetLoDLevel(var->GetLoDLevel()); + // new_var->SetPersistable(var->Persistable()); + + vars_[new_name].reset(new_var); + + // rename inputs and outputs + for (const auto &op : ops_) { + auto *it = op.get(); + it->Rename(old_name, new_name); + } + vars_.erase(old_name); + return new_var; +} + +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + + auto it = vars_.find(name); + if (it == vars_.end()) { + return Parent() == kNoneBlockIndex ? 
nullptr + : ParentBlock()->FindVarRecursive(name); + } + return it->second.get(); +} + +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); + if (res == nullptr) { + res = Var(name_bytes); + } + return *res; +} + +bool BlockDesc::HasVarRecursive(const std::string &name) const { + return FindVarRecursive(name) != nullptr; +} + +std::vector BlockDesc::AllVars() const { + std::vector res; + for (const auto &p : vars_) { + res.push_back(p.second.get()); + } + return res; +} + +OpDesc *BlockDesc::AppendOp() { + need_update_ = true; + ops_.emplace_back(new OpDesc(this)); + return ops_.back().get(); +} + +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { + need_update_ = true; + ops_.emplace_back(std::move(op_desc)); +} + +OpDesc *BlockDesc::PrependOp() { + need_update_ = true; + ops_.emplace_front(new OpDesc(this)); + return ops_.front().get(); +} + +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. + VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; + for (const auto &op : ops_) { + res.push_back(op.get()); + } + return res; +} + +void BlockDesc::Flush() { + for (auto &op_desc : ops_) { + op_desc->Flush(); + } + + if (need_update_) { + auto &op_field = *this->desc_->mutable_ops(); + this->ClearPBOps(); + op_field.Reserve(static_cast(ops_.size())); + for (auto &op_desc : ops_) { + op_field.AddAllocated(op_desc->Proto()); + } + auto &var_field = *this->desc_->mutable_vars(); + this->ClearPBVars(); + var_field.Reserve(static_cast(vars_.size())); + for (auto &var_desc : vars_) { + var_field.AddAllocated(var_desc.second->Proto()); + } + need_update_ = false; + } +} + +BlockDesc *BlockDesc::ParentBlock() const { + if (this->desc_->parent_idx() == kNoneBlockIndex) { + return nullptr; + } + return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); +} + +proto::BlockDesc *BlockDesc::Proto() { + Flush(); + return desc_; +} + +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) + : prog_(prog), desc_(desc), need_update_(false) { + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); + } + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, prog, this)); + } +} + +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) + : prog_(prog), desc_(desc) { + need_update_ = true; + for (auto &op : other.ops_) { + ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); + } + for (auto &it : other.vars_) { + auto *var = new VarDesc(*it.second); + vars_[it.first].reset(var); + } +} + +void BlockDesc::ClearPBOps() { + auto ops = this->desc_->mutable_ops(); + while (!ops->empty()) { + // we do not own the OpDesc, so release the ownership. + ops->ReleaseLast(); + } +} + +void BlockDesc::ClearPBVars() { + auto vars = this->desc_->mutable_vars(); + while (!vars->empty()) { + // we do not own the VarDesc, so release the ownership. 
+ vars->ReleaseLast(); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..71722b78a4ca0b2f499f9961e75980ca1af39e25 --- /dev/null +++ b/paddle/fluid/framework/block_desc.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class ProgramDesc; + +// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize +// read/write speed. Only when we want the protobuf message, the local changes +// will be synchronized (by `Sync` method). + +class BlockDesc { + public: + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); + + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); + + ~BlockDesc() { + this->ClearPBVars(); + this->ClearPBOps(); + } + + int32_t ID() const { return desc_->idx(); } + + int32_t Parent() const { return desc_->parent_idx(); } + + VarDesc *Var(const std::string &name_bytes); + + VarDesc *FindVar(const std::string &name_bytes) const; + + bool HasVar(const std::string &var_name) const; + + VarDesc *RenameVar(const std::string &old_name, const std::string &new_name); + + VarDesc *FindVarRecursive(const std::string &name_bytes) const; + + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); + + bool HasVarRecursive(const std::string &var_name) const; + + std::set LocalVarNames() const { + std::set var_names; + for (auto &var : vars_) { + var_names.insert(var.first); + } + return var_names; + } + + std::vector AllVars() const; + + BlockDesc *ParentBlock() const; + + OpDesc *AppendOp(); + + void AppendAllocatedOp(std::unique_ptr &&op_desc); + + OpDesc *PrependOp(); + + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; + + size_t OpSize() const { return ops_.size(); } + + OpDesc *Op(int idx) { return ops_.at(idx).get(); } + + void Flush(); + + proto::BlockDesc *Proto(); + + ProgramDesc *Program() { return this->prog_; } + + private: + void ClearPBOps(); + void ClearPBVars(); + + private: + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own + bool need_update_; + + std::deque> ops_; + std::unordered_map> vars_; + + DISABLE_COPY_AND_ASSIGN(BlockDesc); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h new file mode 100644 index 0000000000000000000000000000000000000000..5acf4fb39bbeb6bd45d215c962f10f0333578c02 --- /dev/null +++ b/paddle/fluid/framework/channel.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <stddef.h> // for size_t + +namespace paddle { +namespace framework { + +// Channel is the abstract class of buffered and un-buffered channels. +template <typename T> +class Channel { + public: + virtual bool Send(T*) = 0; + virtual bool Receive(T*) = 0; + virtual size_t Cap() = 0; + virtual void Close() = 0; + virtual ~Channel() {} +}; + +// Forward declaration of channel implementations. +namespace details { +template <typename T> +class Buffered; +template <typename T> +class UnBuffered; +} // namespace details + +template <typename T> +Channel<T>* MakeChannel(size_t buffer_size) { + if (buffer_size > 0) { + return new details::Buffered<T>(buffer_size); + } + return new details::UnBuffered<T>(); +} + +template <typename T> +void CloseChannel(Channel<T>* ch) { + ch->Close(); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/details/buffered_channel.h" +#include "paddle/fluid/framework/details/unbuffered_channel.h" diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..953fa40fec8c0480726b44760a3a4c7f59c80a85 --- /dev/null +++ b/paddle/fluid/framework/channel_test.cc @@ -0,0 +1,510 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/channel.h" + +#include <chrono> +#include <thread> + +#include "gtest/gtest.h" + +using paddle::framework::Channel; +using paddle::framework::MakeChannel; +using paddle::framework::CloseChannel; +using paddle::framework::details::Buffered; +using paddle::framework::details::UnBuffered; + +void RecevingOrderEqualToSendingOrder(Channel<int> *ch) { + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + +TEST(Channel, MakeAndClose) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + { + // MakeChannel should return a buffered channel if buffer_size > 0. + auto ch = MakeChannel<size_t>(10); + EXPECT_NE(dynamic_cast<Buffered<size_t> *>(ch), nullptr); + EXPECT_EQ(dynamic_cast<UnBuffered<size_t> *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } + { + // MakeChannel should return an un-buffered channel if buffer_size = 0.
+ auto ch = MakeChannel(0); + EXPECT_EQ(dynamic_cast *>(ch), nullptr); + EXPECT_NE(dynamic_cast *>(ch), nullptr); + CloseChannel(ch); + delete ch; + } +} + +TEST(Channel, SufficientBufferSizeDoesntBlock) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // should not block + EXPECT_EQ(out, i); + } + CloseChannel(ch); + delete ch; +} + +// This tests that a channel must return false +// on send and receive performed after closing the channel. +// Receive will only return false after close when queue is empty. +// By creating separate threads for sending and receiving, we make this +// function able to test both buffered and unbuffered channels. +void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { + const size_t data = 5; + std::thread send_thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), true); // should not block + }}; + + std::thread recv_thread{[&]() { + size_t i; + EXPECT_EQ(ch->Receive(&i), true); // should not block + EXPECT_EQ(i, data); + }}; + + send_thread.join(); + recv_thread.join(); + + // After closing send should return false. Receive should + // also return false as there is no data in queue. + CloseChannel(ch); + send_thread = std::thread{[&]() { + size_t i = data; + EXPECT_EQ(ch->Send(&i), false); // should return false + }}; + recv_thread = std::thread{[&]() { + size_t i; + // should return false because channel is closed and queue is empty + EXPECT_EQ(ch->Receive(&i), false); + }}; + + send_thread.join(); + recv_thread.join(); +} + +TEST(Channel, SendReceiveClosedBufferedChannelPanics) { + size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { + auto ch = MakeChannel(0); + SendReceiveWithACloseChannelShouldPanic(ch); + delete ch; +} + +TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Send(&i), true); // sending should not block + } + + size_t out; + for (size_t i = 0; i < buffer_size / 2; ++i) { + EXPECT_EQ(ch->Receive(&out), true); // receiving should not block + EXPECT_EQ(out, i); + } + + CloseChannel(ch); + + for (size_t i = buffer_size / 2; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + true); // receving should return residual values. + EXPECT_EQ(out, i); + } + + for (size_t i = 0; i < buffer_size; ++i) { + EXPECT_EQ(ch->Receive(&out), + false); // receiving on closed channel should return false + } + delete ch; +} + +TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + size_t sum = 0; + std::thread t([&]() { + // Try to write more than buffer size. 
+ for (size_t i = 0; i < 2 * buffer_size; ++i) { + if (i < buffer_size) + EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations + else + EXPECT_EQ(ch->Send(&i), false); + sum += i; + } + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + EXPECT_EQ(sum, 45U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { + auto ch = MakeChannel(0); + RecevingOrderEqualToSendingOrder(ch); +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch); +} + +void ChannelCloseUnblocksReceiversTest(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + + // Explicitly close the channel + // This should unblock all receivers + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +void ChannelCloseUnblocksSendersTest(Channel *ch) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + if (dynamic_cast *>(ch)) { + // If ch is Buffered, atleast 4 threads must be blocked. + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (!thread_ended[i]) ct++; + } + EXPECT_GE(ct, 4); + } else { + // If ch is UnBuffered, all the threads should be blocked. 
+ for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly close the thread + // This should unblock all senders + CloseChannel(ch); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + if (dynamic_cast *>(ch)) { + // Verify that only 1 send was successful + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksSendersTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing an unbuffered channel also unblocks +// unblocks any senders waiting for senders +TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +TEST(Channel, UnbufferedLessReceiveMoreSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + // Send should block after three iterations + // since we only have three receivers. + std::thread t([&]() { + // Try to send more number of times + // than receivers + for (int i = 0; i < 4; i++) { + ch->Send(&i); + sum_send += i; + } + }); + for (int i = 0; i < 3; i++) { + int recv; + ch->Receive(&recv); + EXPECT_EQ(recv, i); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + EXPECT_EQ(sum_send, 3U); + + CloseChannel(ch); + t.join(); + delete ch; +} + +TEST(Channel, UnbufferedMoreReceiveLessSendTest) { + auto ch = MakeChannel(0); + unsigned sum_send = 0; + unsigned sum_receive = 0; + // The receiver should block after 5 + // iterations, since there are only 5 senders. + std::thread t([&]() { + for (int i = 0; i < 8; i++) { + int recv; + ch->Receive(&recv); // should block after the fifth iteration. 
+ EXPECT_EQ(recv, i); + sum_receive += i; + } + }); + for (int i = 0; i < 5; i++) { + ch->Send(&i); + sum_send += i; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + EXPECT_EQ(sum_send, 10U); + EXPECT_EQ(sum_receive, 10U); + // send three more elements + for (int i = 5; i < 8; i++) { + ch->Send(&i); + sum_send += i; + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 28U); + EXPECT_EQ(sum_receive, 28U); + delete ch; +} + +// This tests that destroying a channel unblocks +// any senders waiting for channel to have write space +void ChannelDestroyUnblockSenders(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + bool send_success[num_threads]; + + // Launches threads that try to write and are blocked because of no readers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + send_success[i] = false; + t[i] = std::thread( + [&](bool *ended, bool *success) { + int data = 10; + *success = ch->Send(&data); + *ended = true; + }, + &thread_ended[i], &send_success[i]); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + bool is_buffered_channel = false; + if (dynamic_cast *>(ch)) is_buffered_channel = true; + + if (is_buffered_channel) { + // If channel is buffered, verify that atleast 4 threads are blocked + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (thread_ended[i] == false) ct++; + } + // Atleast 4 threads must be blocked + EXPECT_GE(ct, 4); + } else { + // Verify that all the threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + } + // Explicitly destroy the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + // Count number of successfuld sends + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + + if (is_buffered_channel) { + // Only 1 send must be successful + EXPECT_EQ(ct, 1); + } else { + // In unbuffered channel, no send should be successful + EXPECT_EQ(ct, 0); + } + + // Join all threads + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that destroying a channel also unblocks +// any receivers waiting on the channel +void ChannelDestroyUnblockReceivers(Channel *ch) { + size_t num_threads = 5; + std::thread t[num_threads]; + bool thread_ended[num_threads]; + + // Launches threads that try to read and are blocked because of no writers + for (size_t i = 0; i < num_threads; i++) { + thread_ended[i] = false; + t[i] = std::thread( + [&](bool *p) { + int data; + // All reads should return false + EXPECT_EQ(ch->Receive(&data), false); + *p = true; + }, + &thread_ended[i]); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + + // Verify that all threads are blocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } + // delete the channel + delete ch; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + // Verify that all threads got unblocked + for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], true); + } + + for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockReceivers(ch); +} + 
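A minimal sketch of how the Channel API exercised by these tests is used from application code; it assumes the same headers as this test file, and the producer/consumer shape is illustrative only:

// Create a buffered channel, feed it from one thread, drain it from another.
paddle::framework::Channel<int>* ch = paddle::framework::MakeChannel<int>(4);
std::thread producer([&]() {
  for (int i = 0; i < 4; ++i) {
    ch->Send(&i);  // copies the value in; blocks when the buffer is full
  }
});
int value;
for (int i = 0; i < 4; ++i) {
  ch->Receive(&value);  // blocks until a value arrives
}
producer.join();
paddle::framework::CloseChannel(ch);  // wakes up any blocked senders/receivers
delete ch;                            // the caller owns the channel object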
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { + size_t buffer_size = 1; + auto ch = MakeChannel(buffer_size); + ChannelDestroyUnblockSenders(ch); +} + +// This tests that destroying an unbuffered channel also unblocks +// unblocks any receivers waiting for senders +TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockReceivers(ch); +} + +TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { + auto ch = MakeChannel(0); + ChannelDestroyUnblockSenders(ch); +} diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c6dd28455b02aa71d8ed09d8c2c81397a6f9955 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + Copy(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..0c4559f586aaf3cc055f9b53b050b3f3a97573bd --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f740f9b3268be31973774674bdea9eb404f718ed --- /dev/null +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); 
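// The kernel registrations below give test_op both a CPU and a CUDA kernel so
// that the CPUtoGPU test can run the same op on both places; setting the
// "use_gpu" attribute makes GetExpectedKernelType request the CUDA kernel,
// which forces the framework to transform the CPU input tensor to the GPU.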
+REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + InitDevices(); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + VLOG(3) << "after gpu_op run"; + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..b72f13f2e8f28556c195e65e3096b6ef1ba9e13a --- /dev/null +++ b/paddle/fluid/framework/data_layout.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +enum class DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { + return DataLayout::kNHWC; + } else if (s == "NCHW") { + return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; + } else { + PADDLE_THROW("Unknown storage order string: %s", s); + } +} + +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case DataLayout::kNHWC: + return "NHWC"; + case DataLayout::kNCHW: + return "NCHW"; + case DataLayout::kAnyLayout: + return "ANY_LAYOUT"; + default: + PADDLE_THROW("unknown DataLayou %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { + out << DataLayoutToString(l); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..c546a508fe1bc1c6e1608cb16a2e1f708a083895 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to) { + PADDLE_ENFORCE_NE(from, to, + "layout transform should transform different layout"); + if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) { + return {0, 2, 3, 1}; + } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) { + return {0, 3, 1, 2}; + } else { + PADDLE_THROW("unsupported transform"); + } +} + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_type.place_), + "TransDataLayout only support DataLayout transform on same place!"); + + PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!"); + + auto& pool = platform::DeviceContextPool::Instance(); + + auto src_dim = in.dims(); + std::vector dst_dim; + + auto axis = GetAxis(kernel_type_for_var.data_layout_, + expected_kernel_type.data_layout_); + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + out->Resize(make_ddim(dst_dim)); + out->mutable_data(expected_kernel_type.place_, in.type()); + + framework::VisitDataType( + framework::ToDataType(in.type()), + CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); + + out->set_layout(expected_kernel_type.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..862405fbf466cd5e5fc819f42cc7392b1c4ca624 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.h @@ -0,0 +1,31 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to); + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..99eb46bde34b089c3da65885748a1e77fe40c700 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(DataTransform, DataLayoutFunction) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto place = CPUPlace(); + Tensor in = Tensor(); + Tensor out = Tensor(); + in.mutable_data(make_ddim({2, 3, 1, 2}), place); + in.set_layout(DataLayout::kNHWC); + + auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNHWC, LibraryType::kPlain); + auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place, + DataLayout::kNCHW, LibraryType::kPlain); + + TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); + + EXPECT_TRUE(out.layout() == DataLayout::kNCHW); + EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1})); + + TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out); + + EXPECT_TRUE(in.layout() == DataLayout::kNHWC); + EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2})); +} \ No newline at end of file diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..9575d01af8875cc21061979e54ce0612d8a7f3a5 --- /dev/null +++ b/paddle/fluid/framework/data_transform.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_transform.h" + +#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/data_type_transform.h" + +namespace paddle { +namespace framework { + +static void PassTensorData(Tensor* from, Tensor* to) { + to->ShareDataWith(*from); + *from = Tensor(); +} + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* output_tensor) { + bool transformed = false; + Tensor in; + in.ShareDataWith(input_tensor); + Tensor out; + + // do layout transform + if (NeedTransformLayout(expected_kernel_type.data_layout_, + kernel_type_for_var.data_layout_)) { + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { + TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + // do device transform + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + TransDataDevice(in, expected_kernel_type.place_, &out); + transformed = true; + PassTensorData(&out, &in); + } + + PADDLE_ENFORCE(transformed, "No transform is applied, please check!"); + // get output data + output_tensor->ShareDataWith(in); +} + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..70d3a174accc8beda06c550bc5ac9ee97897eb2e --- /dev/null +++ b/paddle/fluid/framework/data_transform.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +void DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor, Tensor* out); + +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..7a527f0d0c12806045d21b1cf279ccfd2cf73c8d --- /dev/null +++ b/paddle/fluid/framework/data_type.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +inline proto::DataType ToDataType(std::type_index type) { + using namespace paddle::framework::proto; + if (typeid(float).hash_code() == type.hash_code()) { + return DataType::FP32; + } else if (typeid(double).hash_code() == type.hash_code()) { + return DataType::FP64; + } else if (typeid(int).hash_code() == type.hash_code()) { + return DataType::INT32; + } else if (typeid(int64_t).hash_code() == type.hash_code()) { + return DataType::INT64; + } else if (typeid(bool).hash_code() == type.hash_code()) { + return DataType::BOOL; + } else { + PADDLE_THROW("Not supported"); + } +} + +inline std::type_index ToTypeIndex(proto::DataType type) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP32: + return typeid(float); + case DataType::FP64: + return typeid(double); + case DataType::INT32: + return typeid(int); + case DataType::INT64: + return typeid(int64_t); + case DataType::BOOL: + return typeid(bool); + default: + PADDLE_THROW("Not support type %d", type); + } +} + +template +inline void VisitDataType(proto::DataType type, Visitor visitor) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP32: + visitor.template operator()(); + break; + case DataType::FP64: + visitor.template operator()(); + break; + case DataType::INT32: + visitor.template operator()(); + break; + case DataType::INT64: + visitor.template operator()(); + break; + case DataType::BOOL: + visitor.template operator()(); + break; + default: + PADDLE_THROW("Not supported"); + } +} + +inline std::string DataTypeToString(const proto::DataType type) { + using namespace paddle::framework::proto; + switch (type) { + case DataType::FP16: + return "float16"; + case DataType::FP32: + 
return "float32"; + case DataType::FP64: + return "float64"; + case DataType::INT16: + return "int16"; + case DataType::INT32: + return "int32"; + case DataType::INT64: + return "int64"; + case DataType::BOOL: + return "bool"; + default: + PADDLE_THROW("Not support type %d", type); + } +} + +inline std::ostream& operator<<(std::ostream& out, + const proto::DataType& type) { + out << DataTypeToString(type); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..6921927305aa3a7dee801ead888737a1ab93fc8e --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto* in_begin = in_.data(); + auto* in_end = in_begin + in_.numel(); + auto* out_begin = out_->mutable_data(in_.place()); + + if (platform::is_cpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
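      // Illustrative workaround sketch only (not part of this patch): a
      // caller could stage the cast through host memory, i.e. copy in_ to a
      // CPU tensor, run the CPU Transform path above on it, and copy the
      // result back to in_.place(). The exact copy helper (e.g. a
      // framework-level Copy/TensorCopy) is an assumption and may differ in
      // this revision, so the device path keeps throwing for now: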
+ PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + out->Resize(in.dims()); + auto src_type = kernel_type_for_var.data_type_; + auto dst_type = expected_kernel_type.data_type_; + auto ctx = pool.Get(in.place()); + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..830cced093913839ddef2841b3de1017dc2bc426 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..88dbc51b21718e2261f1c9485177621a827485e2 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, CPUTransform) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto place = CPUPlace(); + + Tensor in; + Tensor out; + + float* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + auto kernel_fp32 = OpKernelType(proto::DataType::FP32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::DataType::FP64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::DataType::INT32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + } + + TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + } +} diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc new file mode 100644 index 0000000000000000000000000000000000000000..f063ee2e6dd81e66b7b74aa23e9967d865c4d297 --- /dev/null +++ b/paddle/fluid/framework/ddim.cc @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +/// @cond HIDDEN + +template +Dim make_dim(const int64_t* d) { + return Dim(*d, make_dim(d + 1)); +} + +template <> +Dim<1> make_dim<1>(const int64_t* d) { + return Dim<1>(*d); +} + +void make_ddim(DDim& ddim, const int64_t* dims, int n) { + switch (n) { + case 1: + ddim = make_dim<1>(dims); + break; + case 2: + ddim = make_dim<2>(dims); + break; + case 3: + ddim = make_dim<3>(dims); + break; + case 4: + ddim = make_dim<4>(dims); + break; + case 5: + ddim = make_dim<5>(dims); + break; + case 6: + ddim = make_dim<6>(dims); + break; + case 7: + ddim = make_dim<7>(dims); + break; + case 8: + ddim = make_dim<8>(dims); + break; + case 9: + ddim = make_dim<9>(dims); + break; + default: + PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); + } +} + +/// @endcond + +DDim make_ddim(std::initializer_list dims) { + DDim result(make_dim(0)); + make_ddim(result, dims.begin(), dims.size()); + return result; +} + +DDim make_ddim(const std::vector& dims) { + DDim result(make_dim(0)); + make_ddim(result, &dims[0], dims.size()); + return result; +} + +DDim make_ddim(const std::vector& dims) { + std::vector res(dims.size()); + std::transform(dims.begin(), dims.end(), res.begin(), + [](int d) { return static_cast(d); }); + return make_ddim(res); +} + +/// @cond HIDDEN +// XXX For some reason, putting this in an anonymous namespace causes errors +class DynamicMutableIndexer : public boost::static_visitor { + public: + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} + + template + int64_t& operator()(Dim& dim) const { + return dim[idx_]; + } + + private: + int idx_; +}; + +class DynamicConstIndexer : public boost::static_visitor { + public: + explicit DynamicConstIndexer(int idx) : idx_(idx) {} + + template + int64_t operator()(const Dim& dim) const { + return dim[idx_]; + } + + private: + int idx_; +}; + +/// @endcond + +int64_t& DDim::operator[](int idx) { + return boost::apply_visitor(DynamicMutableIndexer(idx), var); +} + +int64_t DDim::operator[](int idx) const { + return boost::apply_visitor(DynamicConstIndexer(idx), var); +} + +int DDim::size() const { return arity(*this); } + +bool DDim::operator==(DDim d) const { + if (var.which() != d.getVar().which()) { + return false; + } else { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + for (unsigned int i = 0; i < v1.size(); i++) { + if (v1[i] != v2[i]) { + return false; + } + } + + return true; + } +} + +bool DDim::operator!=(DDim d) const { return !(*this == d); } + +DDim DDim::operator+(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] + v2[i]); + } + + return make_ddim(v3); +} + +DDim DDim::operator*(DDim d) const { + std::vector v1 = vectorize(*this); + std::vector v2 = vectorize(d); + + std::vector v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] * v2[i]); + } + + return make_ddim(v3); +} + +int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } + +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } + +/// @cond HIDDEN +struct VectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + + explicit VectorizeVisitor(std::vector& v) : vector(v) {} + + template + void operator()(const T& t) { + 
vector.push_back(t.head); + this->operator()(t.tail); + } + + void operator()(const Dim<1>& t) { vector.push_back(t.head); } +}; +/// @endcond + +std::vector vectorize(const DDim& ddim) { + std::vector result; + VectorizeVisitor visitor(result); + boost::apply_visitor(visitor, ddim); + return result; +} + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. +std::vector vectorize2int(const DDim& ddim) { + std::vector temp = vectorize(ddim); + std::vector result(temp.begin(), temp.end()); + return result; +} + +struct ProductVisitor : public boost::static_visitor { + template + int64_t operator()(const Dim& dim) { + return product(dim); + } +}; + +int64_t product(const DDim& ddim) { + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + } + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return make_ddim(vec); +} + +/// \cond HIDDEN + +struct ArityVisitor : boost::static_visitor { + template + int operator()(Dim) const { + return D; + } +}; + +/// \endcond + +int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } + +/// \cond HIDDEN + +struct DDimPrinter : boost::static_visitor { + std::ostream& os; + explicit DDimPrinter(std::ostream& os_) : os(os_) {} + + template + void operator()(const T& t) { + os << t; + } +}; + +/// \endcond + +std::ostream& operator<<(std::ostream& os, const DDim& ddim) { + DDimPrinter printer(os); + boost::apply_visitor(printer, ddim); + return os; +} + +DDim::DDim(std::initializer_list init_list) { + *this = make_ddim(init_list); +} + +DDim flatten_to_2d(const DDim& src, int num_col_dims) { + int rank = src.size(); + return make_ddim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, rank))}); +} + +DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } + +DDim stride(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return framework::make_ddim(strides); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h new file mode 100644 index 0000000000000000000000000000000000000000..750ab787abb72fa3f2984caf58354e327750aa3d --- /dev/null +++ b/paddle/fluid/framework/ddim.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. + */ +struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; + DDimVar var; + + DDim() : var(Dim<1>()) {} + + template + explicit DDim(const Dim& in) : var(in) {} + + /*implicit*/ DDim(std::initializer_list init_list); + + template + DDim& operator=(const Dim& in) { + var = in; + return *this; + } + + int64_t& operator[](int idx); + int64_t operator[](int idx) const; + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) { + return var.apply_visitor(visitor); + } + + template + typename Visitor::result_type apply_visitor(Visitor& visitor) const { + return var.apply_visitor(visitor); + } + + DDimVar getVar() { return var; } + + bool operator==(DDim d) const; + + bool operator!=(DDim d) const; + + DDim operator+(DDim d) const; + + DDim operator*(DDim d) const; + + int size() const; +}; + +/** + * \brief Make a DDim from std::vector + * + * \param dims An vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector& dims); + +DDim make_ddim(const std::vector& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints. Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list dims); + +int64_t get(const DDim& dim, int idx); +void set(DDim& dim, int idx, int val); + +std::vector vectorize(const DDim& ddim); +std::vector vectorize2int(const DDim& ddim); + +int64_t product(const DDim& ddim); + +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + +/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const DDim&); + +// Reshape a tensor to a matrix. The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + +DDim stride(const DDim& ddim); +} // namespace framework +} // namespace paddle + +namespace boost { + +template +T get(const paddle::framework::DDim& in) { + return boost::get(in.var); +} + +} // namespace boost diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..18d305a4036840066a7d9c999a7e73db863274d7 --- /dev/null +++ b/paddle/fluid/framework/ddim_test.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initialization list + paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector vec({9, 1, 5}); + paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + paddle::framework::set(ddim, 0, 6); + EXPECT_EQ(paddle::framework::get(ddim, 0), 6); + + // vectorize a DDim + std::vector res_vec = paddle::framework::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + paddle::framework::Dim<3> d(3, 2, 1); + res_vec = paddle::framework::vectorize(paddle::framework::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // add two DDims + paddle::framework::DDim ddim_sum = ddim + vddim; + EXPECT_EQ(ddim_sum[0], 15); + EXPECT_EQ(ddim_sum[1], 3); + EXPECT_EQ(ddim_sum[2], 10); + + // multiply two DDims + paddle::framework::DDim ddim_mul = ddim * vddim; + EXPECT_EQ(ddim_mul[0], 54); + EXPECT_EQ(ddim_mul[1], 2); + EXPECT_EQ(ddim_mul[2], 25); + + // arity of a DDim + EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); + + // product of a DDim + EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/fluid/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h new file mode 100644 index 0000000000000000000000000000000000000000..88faf3acf7c17b0cb3770a8910e400a1f6688f5f --- /dev/null +++ b/paddle/fluid/framework/details/buffered_channel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of Buffered Channel: +// - A send to a full channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from an empty channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. + +template +class Buffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return cap_; } + virtual void Close(); + virtual ~Buffered(); + + private: + size_t cap_; + std::mutex mu_; + std::condition_variable empty_cond_var_; + std::condition_variable full_cond_var_; + std::condition_variable destructor_cond_var_; + std::deque channel_; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + Buffered(size_t cap) : cap_(cap), closed_(false) { + PADDLE_ENFORCE_GT(cap, 0); + } + + void NotifyAllParticipants(std::unique_lock*); +}; + +template +bool Buffered::Send(T* item) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + std::unique_lock lock(mu_); + full_cond_var_.wait(lock, + [this]() { return channel_.size() < cap_ || closed_; }); + if (!closed_) { + channel_.push_back(std::move(*item)); + lock.unlock(); + empty_cond_var_.notify_one(); + ret = true; + } + send_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +bool Buffered::Receive(T* item) { + bool ret = false; + // Once the channel has been closed and all data has been consumed, + // just return false. Don't even try acquiring the mutex. 
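  // Illustrative example of this contract: if a sender pushes {1, 2} and the
  // channel is then closed, two further Receive() calls still succeed and
  // drain 1 and 2 from the buffer; only the third call observes
  // closed_ && channel_.empty() and returns false immediately.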
+ if (closed_ && channel_.empty()) { + return false; + } + recv_ctr++; + std::unique_lock lock(mu_); + empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); + if (!channel_.empty()) { + *item = std::move(channel_.front()); + channel_.pop_front(); + full_cond_var_.notify_one(); + ret = true; + } + recv_ctr--; + destructor_cond_var_.notify_one(); + return ret; +} + +template +void Buffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_); + closed_ = true; + NotifyAllParticipants(&lock); +} + +template +Buffered::~Buffered() { + std::unique_lock lock(mu_); + closed_ = true; + channel_.clear(); + NotifyAllParticipants(&lock); + + // The destructor must wait for all readers and writers to complete their task + // The channel has been closed, so we will not accept new readers and writers + lock.lock(); + destructor_cond_var_.wait( + lock, [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +template +void Buffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + full_cond_var_.notify_all(); + empty_cond_var_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h similarity index 100% rename from paddle/framework/details/cow_ptr.h rename to paddle/fluid/framework/details/cow_ptr.h diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2142af277c0b356d83941b3baab1947cce31dac --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..d73604ad185a66ade0168f585d1951d0d7d4a5f9 --- /dev/null +++ b/paddle/fluid/framework/details/op_registry.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { +namespace details { + +enum OpInfoFillType { + kOperator = 0, + kOpProtoAndCheckerMaker = 1, + kGradOpDescMaker = 2, + kVarTypeInference = 3, + kShapeInference = 4 +}; + +template +struct OpInfoFillTypeID { + static constexpr OpInfoFillType ID() { + return std::is_base_of::value + ? kOperator + : (std::is_base_of::value + ? kOpProtoAndCheckerMaker + : (std::is_base_of::value + ? kGradOpDescMaker + : (std::is_base_of::value + ? kVarTypeInference + : (std::is_base_of::value + ? kShapeInference + : static_cast( + -1))))); + } +}; + +template ::ID()> +struct OpInfoFiller; + +template +class OperatorRegistrarRecursive; + +template +class OperatorRegistrarRecursive { + public: + using T = typename std::tuple_element>::type; + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) { + OpInfoFiller fill; + fill(op_type, info); + constexpr auto size = sizeof...(ARGS); + OperatorRegistrarRecursive reg(op_type, + info); + (void)(reg); + } +}; + +template +class OperatorRegistrarRecursive { + public: + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {} +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->creator_ = [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + return new T(type, inputs, outputs, attrs); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->proto_ = new proto::OpProto; + info->checker_ = new OpAttrChecker(); + auto maker = T(info->proto_, info->checker_); + maker.Validate(); + info->proto_->set_type(op_type); + PADDLE_ENFORCE( + info->proto_->IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, info->proto_->InitializationErrorString()); + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->grad_op_maker_ = []( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { + T inference; + inference(fwd_op, block); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_shape_ = [](InferShapeContext* ctx) { + T inference; + inference(ctx); + }; + } +}; + +} // namespace details + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h new file mode 100644 index 
0000000000000000000000000000000000000000..5c9424928cb7029aac813e7b2f29f81a0093f836 --- /dev/null +++ b/paddle/fluid/framework/details/unbuffered_channel.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/channel.h" + +namespace paddle { +namespace framework { +namespace details { + +// Four of the properties of UnBuffered Channel: +// - A send to a channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from a channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. +template +class UnBuffered : public paddle::framework::Channel { + friend Channel* paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel*); + + public: + virtual bool Send(T*); + virtual bool Receive(T*); + virtual size_t Cap() { return 0; } + virtual void Close(); + virtual ~UnBuffered(); + + private: + std::mutex mu_ch_; + // Mutex for readers and writers who are waiting for other reader + // and writer to complete execution + std::recursive_mutex mu_read_, mu_write_; + // reader_found_ is set true when a reader is ready to accept data + // writer_found_ is set true when a writer is ready to send data + // A transaction occurs only when both are true + std::atomic reader_found_{false}, writer_found_{false}; + std::condition_variable cv_channel_; + std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_; + T* item{nullptr}; + std::atomic closed_{false}; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + + UnBuffered() : closed_(false) {} + + void NotifyAllParticipants(std::unique_lock*); +}; + +// This function implements the concept of how data should +// be sent from a writer to a reader. +template +bool UnBuffered::Send(T* data) { + bool ret = false; + if (closed_) { + return ret; + } + send_ctr++; + // Prevent other writers from entering + std::unique_lock writer_lock(mu_write_); + writer_found_ = true; + std::unique_lock cv_lock(mu_write_); + // If writer comes first, it should wait till a reader arrives + cv_writer_.wait(cv_lock, + [this]() { return reader_found_ == true || closed_; }); + cv_reader_.notify_one(); + if (!closed_) { + std::unique_lock channel_lock(mu_ch_); + item = data; + channel_lock.unlock(); + cv_channel_.notify_one(); + channel_lock.lock(); + cv_channel_.wait(channel_lock, + [this]() { return item == nullptr || closed_; }); + ret = true; + } + writer_found_ = false; + send_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the concept of how +// data that was sent by a writer is read from a reader. +template +bool UnBuffered::Receive(T* data) { + bool ret = false; + // If channel is closed, we don't even want any reader to enter. 
+ // Unlike a buffered channel, an unbuffered channel does not allow + // readers to read after closing because there is no buffer to be consumed. + if (closed_) return ret; + recv_ctr++; + // Prevent other readers from entering + std::unique_lock read_lock{mu_read_}; + reader_found_ = true; + std::unique_lock cv_lock{mu_read_}; + // If reader comes first, it should wait till a writer arrives + cv_reader_.wait(cv_lock, + [this]() { return writer_found_ == true || closed_; }); + cv_writer_.notify_one(); + if (!closed_) { + std::unique_lock lock_ch{mu_ch_}; + // Reader should wait for the writer to first write its data + cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); + if (!closed_) { + *data = std::move(*item); + item = nullptr; + lock_ch.unlock(); + ret = true; + } + cv_channel_.notify_one(); + } + reader_found_ = false; + recv_ctr--; + cv_destructor_.notify_one(); + return ret; +} + +// This function implements the sequence of events +// that take place once the channel is closed. +template +void UnBuffered::Close() { + if (closed_) { + return; + } + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); +} + +// This function implements the sequence of events +// that are executed once the object of an UnBuffered +// channel is destroyed. +template +UnBuffered::~UnBuffered() { + std::unique_lock lock(mu_ch_); + item = nullptr; + closed_ = true; + NotifyAllParticipants(&lock); + lock.lock(); + cv_destructor_.wait(lock, + [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +// This function notifies all the readers, writers and +// the channel condition variables. +template +void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { + lock->unlock(); + cv_writer_.notify_all(); + cv_channel_.notify_all(); + cv_reader_.notify_all(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h new file mode 100644 index 0000000000000000000000000000000000000000..3938fd3df5b54443fcbaebf600840ccf2337a173 --- /dev/null +++ b/paddle/fluid/framework/dim.h @@ -0,0 +1,421 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +// Statically sized, statically indexed dimension +template +struct Dim { + static constexpr int dimensions = i; + + template + HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { + static_assert(sizeof...(_tail) == i - 1, + "Dim initialized with the wrong number of parameters"); + } + + HOSTDEVICE + Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + + HOSTDEVICE + Dim() : head(0), tail() {} + + /** Construct a Dim from a linear index and size. Uses Fortran order + * indexing. 
*/ + HOSTDEVICE + Dim(int64_t idx, const Dim& size) + : head(idx % size.head), tail(idx / size.head, size.tail) {} + + /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE + Dim(int64_t idx) : head(idx), tail(idx) {} + + HOSTDEVICE + bool operator==(const Dim& o) const { + return (head == o.head) && (tail == o.tail); + } + + HOSTDEVICE + bool operator!=(const Dim& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + HOST std::string to_string() const; + + int64_t head; + Dim tail; +}; + +// Base case specialization +template <> +struct Dim<1> { + static constexpr int dimensions = 1; + + HOSTDEVICE + Dim(int64_t _head) : head(_head) {} + + HOSTDEVICE + Dim() : head(0) {} + + HOSTDEVICE + Dim(int idx, const Dim<1>& size) : head(idx) { +#ifndef __CUDA_ARCH__ + if (idx >= size.head) { + throw std::invalid_argument("Index out of range."); + } +#else + PADDLE_ASSERT(idx < size.head); +#endif + } + + HOSTDEVICE + bool operator==(const Dim<1>& o) const { return (head == o.head); } + + HOSTDEVICE + bool operator!=(const Dim<1>& o) const { return !(*this == o); } + + HOSTDEVICE + int64_t& operator[](int idx); + HOSTDEVICE + int64_t operator[](int idx) const; + + int64_t head; +}; + +namespace { + +// Helper for accessing Dim classes +template +struct DimGetter { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return DimGetter::impl(d.tail); + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return DimGetter::impl(d.tail); + } +}; + +// Eureka! We found the element! +template <> +struct DimGetter<0> { + // Return a copy if Dim is const + template + HOSTDEVICE static int64_t impl(const D& d) { + return d.head; + } + // Return a reference if Dim is mutable + template + HOSTDEVICE static int64_t& impl(D& d) { + return d.head; + } +}; + +template +HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +template +HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + PADDLE_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + PADDLE_ASSERT(idx == 0); +#endif + return dim.head; +} + +} // namespace +// Static access to constant Dim +template +HOSTDEVICE int64_t get(const Dim& d) { + return DimGetter::impl(d); +} + +// Static access to mutable Dim +template +HOSTDEVICE int64_t& get(Dim& d) { + return DimGetter::impl(d); +} + +// Dynamic access to constant Dim +template +HOSTDEVICE int64_t Dim::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE int64_t& Dim::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant 
Dim +inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +// without std::enable_if will try to instantiate this on get<0>(d) +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, + int i) { + return d[i]; +} + +// Dynamic access to mutable Dim +template +HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, + int i) { + return d[i]; +} + +// Dot product of two dims +template +HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { + return a.head * b.head + linearize(a.tail, b.tail); +} + +// Base case dot product of two Dims +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) { + return a.head * b.head; +} + +// Product of a Dim +template +HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { + return prod * a.head * product(a.tail); +} + +// Base case product of a Dim +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) { + return prod * a.head; +} + +// Is 0 <= idx_i < size_i for all i? +template +HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { + return ((0 <= idx.head) && (idx.head < size.head) && + contained(idx.tail, size.tail)); +} + +// Base case of is 0 <= idx_i < size_i ? +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) { + return ((0 <= idx.head) && (idx.head < size.head)); +} + +/** + * \brief Compute exclusive prefix-multiply of a Dim. + */ +template +HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { + return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); +} + +///\cond HIDDEN +// Base case of ex_prefix_mul +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) { + return Dim<1>(mul); +} +///\endcond + +/** + * Add two dimensions together + */ +template +HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { + return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head + b.head); +} + +template +HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { + return dim_plus(lhs, rhs); +} + +/** + * Multiply two dimensions together + */ +template +HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { + return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head * b.head); +} + +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { + return dim_mult(lhs, rhs); +} + +/** + * \brief Normalize strides to ensure any dimension with extent 1 + * has stride 0. + * + * \param size Dim object containing the size of an array + * \param stride Dim object containing stride of an array + * \return Dim object the same size as \p size with normalized strides + * + */ + +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + int norm_stride = size.head == 1 ? 
0 : stride.head; + return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); +} + +///\cond HIDDEN + +template <> +HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size, + const Dim<1>& stride) { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim<1>(norm_stride); +} + +///\endcond + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +HOSTDEVICE Dim make_dim(Args... idxes) { + return Dim(idxes...); +} + +// Allows us to output a Dim +// XXX For some reason, overloading fails to resolve this correctly +template +typename std::enable_if<(i > 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head << ", " << d.tail; + return os; +} + +// Base case that allows us to output a Dim +// XXX I wish this could be an overload instead of a template +template +typename std::enable_if<(i == 1), std::ostream&>::type operator<<( + std::ostream& os, const Dim& d) { + os << d.head; + return os; +} + +template +HOST std::string Dim::to_string() const { + std::stringstream stream; + + stream << *this; + + return stream.str(); +} + +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { + Dim result; + + for (int i = 0; i < D - 1; ++i) { + result[i] = linear_index % extents[i]; + linear_index /= extents[i]; + } + + result[D - 1] = linear_index; + + return result; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0f1969d79775ba70661806d589ae3de2696b77e8 --- /dev/null +++ b/paddle/fluid/framework/dim_test.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
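Before the CUDA test below, a minimal host-only sketch of how the stride helpers defined in dim.h compose, assuming only the header added in this patch: ex_prefix_mul produces per-dimension strides in Fortran order, and linearize folds an index Dim against those strides into a flat offset.

#include <iostream>

#include "paddle/fluid/framework/dim.h"

int main() {
  // Extents of a 3-D array, interpreted in Fortran (column-major) order.
  auto size = paddle::framework::make_dim(3, 4, 5);
  // Per-dimension strides computed by ex_prefix_mul: {1, 3, 12}.
  auto stride = paddle::framework::ex_prefix_mul(size);
  // Flat offset of index (2, 1, 3): 2*1 + 1*3 + 3*12 = 41.
  auto idx = paddle::framework::make_dim(2, 1, 3);
  std::cout << paddle::framework::linearize(idx, stride) << std::endl;
  return 0;
}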
+#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/dim.h" + +__global__ void test(paddle::framework::Dim<2>* o) { + o[0] = paddle::framework::make_dim(5, 6); +} + +__global__ void dyn_idx_gpu(int64_t* o) { + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; +} + +TEST(Dim, Equality) { + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int64_t res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); +} + +TEST(Dim, Bool) { + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); +} + +TEST(Dim, Print) { + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } +} diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h new file mode 100644 index 0000000000000000000000000000000000000000..d1b8c701a7941813c0fbf441b8a6c7f4d3811a6d --- /dev/null +++ b/paddle/fluid/framework/eigen.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + Type ret; + for (int64_t d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, DDim dims) { + return Type(tensor.data(), EigenDim::From(dims)); + } + + static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + + static ConstType From(const Tensor& tensor, DDim dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims_); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } + + static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + return EigenVector::From(tensor, {product(tensor.dims_)}); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9e3abeccb34fd1778cac6918d24e30021b433e9 --- /dev/null +++ b/paddle/fluid/framework/eigen_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include + +namespace paddle { +namespace framework { + +TEST(EigenDim, From) { + EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3})); + ASSERT_EQ(1, ed[0]); + ASSERT_EQ(2, ed[1]); + ASSERT_EQ(3, ed[2]); +} + +TEST(Eigen, Tensor) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenTensor::Type et = EigenTensor::From(t); + + ASSERT_EQ(1, et.dimension(0)); + ASSERT_EQ(2, et.dimension(1)); + ASSERT_EQ(3, et.dimension(2)); + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 3; k++) { + ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f); + } + } + } +} + +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + +TEST(Eigen, VectorFrom) { + Tensor t; + float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); + for (int i = 0; i < 6; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::From(t); + + ASSERT_EQ(6, ev.dimension(0)); + + for (int i = 0; i < 6; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, VectorFlatten) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::Flatten(t); + + ASSERT_EQ(1 * 2 * 3, ev.dimension(0)); + + for (int i = 0; i < 1 * 2 * 3; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, Matrix) { + Tensor t; + float* p = t.mutable_data(make_ddim({2, 3}), platform::CPUPlace()); + for (int i = 0; i < 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::From(t); + + ASSERT_EQ(2, em.dimension(0)); + ASSERT_EQ(3, em.dimension(1)); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f); + } + } +} + +TEST(Eigen, MatrixReshape) { + Tensor t; + float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::Reshape(t, 2); + + ASSERT_EQ(2 * 3, em.dimension(0)); + ASSERT_EQ(6 * 4, em.dimension(1)); + + for (int i = 0; i < 2 * 3; i++) { + for (int j = 0; j < 6 * 4; j++) { + ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..816ad8d6590a1af3a043cfef5da1edee5119575d --- /dev/null +++ b/paddle/fluid/framework/executor.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor.h" + +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(benchmark); +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); + +namespace paddle { +namespace framework { + +Executor::Executor(const platform::Place& place) : place_(place) {} + +static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { + if (var_type == proto::VarDesc::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarDesc::READER) { + var->GetMutable(); + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER]", + var_type); + } +} + +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { + // TODO(tonyyang-svail): + // - only runs on the first device (i.e. 
no interdevice communication) + // - will change to use multiple blocks for RNN op and Cond Op + PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); + auto& block = pdesc.Block(block_id); + + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } // if (create_local_scope) + } // if (create_vars) + + for (auto& op_desc : block.AllOps()) { + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + VLOG(4) << op->DebugStringEx(local_scope); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + + op->Run(*local_scope, place_); + VLOG(3) << op->DebugStringEx(local_scope); + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } + } + if (create_vars && create_local_scope) { + scope->DeleteScope(local_scope); + } + if (FLAGS_benchmark) { + VLOG(2) << "-------------------------------------------------------"; + VLOG(2) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(2) << "-------------------------------------------------------"; + } +} + +// Check whether the block already has feed operators and feed_holder. +// Return false if the block does not have any feed operators. +// If some feed operators have been prepended to the block, check that +// the info contained in these feed operators matches the feed_targets +// and feed_holder_name. Raise exception when any mismatch is found. +// Return true if the block has feed operators and holder of matching info. 
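For orientation, here is a minimal caller-side sketch of the plain Run overload defined above; the helper name ExecuteBlockZero and the surrounding setup are illustrative and not part of this patch, and the feed/fetch helpers that follow build on this same entry point:

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/init.h"

// Hypothetical driver; assumes `program` was built elsewhere.
void ExecuteBlockZero(const paddle::framework::ProgramDesc& program) {
  paddle::framework::InitDevices();  // register the CPU (and any CUDA) device contexts
  paddle::framework::Scope scope;    // root scope that keeps persistable variables
  paddle::framework::Executor exe(paddle::platform::CPUPlace());
  // Execute block 0; by default a local scope is created for non-persistable
  // variables and deleted again when Run returns.
  exe.Run(program, &scope, /*block_id=*/0, /*create_local_scope=*/true,
          /*create_vars=*/true);
}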
+static bool has_feed_operators( + BlockDesc* block, std::map& feed_targets, + const std::string& feed_holder_name) { + size_t feed_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_count++; + PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, + "Input to feed op should be '%s'", feed_holder_name); + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE( + feed_targets.find(feed_target_name) != feed_targets.end(), + "Feed operator output name '%s' cannot be found in 'feed_targets'", + feed_target_name); + } + } + + if (feed_count > 0) { + PADDLE_ENFORCE_EQ( + feed_count, feed_targets.size(), + "The number of feed operators should match 'feed_targets'"); + + // When feed operator are present, so should be feed_holder + auto var = block->FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } + + return feed_count > 0; +} + +// Check whether the block already has fetch operators and fetch_holder. +// Return false if the block does not have any fetch operators. +// If some fetch operators have been appended to the block, check that +// the info contained in these fetch operators matches the fetch_targets +// and fetch_holder_name. Raise exception when any mismatch is found. +// Return true if the block has fetch operators and holder of matching info. +static bool has_fetch_operators( + BlockDesc* block, std::map& fetch_targets, + const std::string& fetch_holder_name) { + size_t fetch_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_count++; + PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, + "Output of fetch op should be '%s'", fetch_holder_name); + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Fetch operator input name '%s' cannot be found in 'fetch_targets'", + fetch_target_name); + } + } + + if (fetch_count > 0) { + PADDLE_ENFORCE_EQ( + fetch_count, fetch_targets.size(), + "The number of fetch operators should match 'fetch_targets'"); + + // When fetch operator are present, so should be fetch_holder + auto var = block->FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } + + return fetch_count > 0; +} + +void Executor::Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + auto* copy_program = new ProgramDesc(program); + auto* global_block = copy_program->MutableBlock(0); + + if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) { + // create feed_holder variable + auto* feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH); + feed_holder->SetPersistable(true); + + int i = 0; + for (auto& feed_target : feed_targets) { + std::string var_name = feed_target.first; + VLOG(3) << "feed target's name: " << var_name; + + // prepend feed op + auto* op = global_block->PrependOp(); + op->SetType(kFeedOpType); + op->SetInput("X", {feed_holder_name}); + op->SetOutput("Out", {var_name}); + 
op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + // map the data of feed_targets to feed_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) { + // create fetch_holder variable + auto* fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarDesc::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_target : fetch_targets) { + std::string var_name = fetch_target.first; + VLOG(3) << "fetch target's name: " << var_name; + + // append fetch op + auto* op = global_block->AppendOp(); + op->SetType(kFetchOpType); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + Run(*copy_program, scope, 0, true, true); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } + + delete copy_program; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h new file mode 100644 index 0000000000000000000000000000000000000000..893c949939e8db2f5227b940bf721ca7f114db9c --- /dev/null +++ b/paddle/fluid/framework/executor.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +class Executor { + public: + // TODO(dzhwinter) : Do not rely on this function, it will be removed + explicit Executor(const platform::DeviceContext& device) + : Executor(device.GetPlace()) {} + + explicit Executor(const platform::Place& place); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); + + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + + private: + const platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9bb17355d9bc1fb7f355a4038a4c5831d3530b1 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "glog/logging.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index].ShareDataWith(input); + // set lod + feed_inputs[index].set_lod(input.lod()); +} + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { + // Since we want to fetch LodTensor from a variable, the variable must + // be created alreadly. 
+ Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE(g_fetch_value->IsType(), + "Only %s can be invoked by GetFetchVariable", + typeid(FeedFetchList).name()); + auto& fetch_outputs = *g_fetch_value->GetMutable(); + auto& tensor = fetch_outputs[index]; + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); + PADDLE_ENFORCE_LT(index, fetch_outputs.size()); + return tensor; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h new file mode 100644 index 0000000000000000000000000000000000000000..5355c29047e668d1a2ec141b303dd562158b2bb3 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index); + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h new file mode 100644 index 0000000000000000000000000000000000000000..4281e36b138f66268e2f1e835d4475676d97839d --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using FeedFetchType = LoDTensor; +using FeedFetchList = std::vector; + +static const std::string kFeedOpType = "feed"; +static const std::string kFetchOpType = "fetch"; +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/framework.proto b/paddle/fluid/framework/framework.proto similarity index 100% rename from paddle/framework/framework.proto rename to paddle/fluid/framework/framework.proto diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..21dd4e885485f88eeeb034ca45c643f9fadf3163 --- /dev/null +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. + */ +class GradOpDescMakerBase { + public: + explicit GradOpDescMakerBase( + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) + : fwd_op_(fwd_op), + no_grad_set_(no_grad_set), + grad_to_var_(grad_to_var), + grad_block_(grad_block) {} + + virtual ~GradOpDescMakerBase() = default; + virtual std::vector> operator()() const = 0; + + protected: + std::vector InputGrad(const std::string& name, + bool drop_empty_grad = true) const { + std::vector ret_val; + auto var_names = this->Input(name); + ret_val.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), + std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + if (no_grad_set_.count(g_name)) { + return kEmptyVarName; + } else { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } + }); + if (!drop_empty_grad) { + return ret_val; + } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker." 
+ " Op type %s", + fwd_op_.Type()); + + std::vector dropped_ret_val; + dropped_ret_val.reserve(ret_val.size()); + std::copy_if(ret_val.begin(), ret_val.end(), + std::back_inserter(dropped_ret_val), + [](const std::string& str) { return str != kEmptyVarName; }); + return dropped_ret_val; + } + + std::vector OutputGrad(const std::string& name) const { + std::vector ret_val; + auto onames = this->Output(name); + ret_val.reserve(onames.size()); + std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); + return ret_val; + } + + std::vector InputNames() const { + return this->fwd_op_.InputNames(); + } + + std::vector OutputNames() const { + return this->fwd_op_.OutputNames(); + } + + std::vector Input(const std::string& name) const { + return fwd_op_.Input(name); + } + + std::vector Output(const std::string& name) const { + return fwd_op_.Output(name); + } + + const std::unordered_map& Attrs() const { + return fwd_op_.GetAttrMap(); + } + + const Attribute& GetAttr(const std::string& name) const { + auto& map = fwd_op_.GetAttrMap(); + auto it = map.find(name); + PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name); + return it->second; + } + + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + + std::string ForwardOpType() const { return this->fwd_op_.Type(); } + + private: + const OpDesc& fwd_op_; + const std::unordered_set& no_grad_set_; + std::unordered_map* grad_to_var_; + + protected: + std::vector grad_block_; +}; + +class SingleGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const { + std::vector> retv; + retv.emplace_back(this->Apply()); + return retv; + } + + protected: + virtual std::unique_ptr Apply() const = 0; +}; + +template +class DefaultGradOpDescMaker : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); + grad->SetType(this->GradOpType()); + + for (auto& input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(GradVarName(input_param), + this->InputGrad(input_param, DropEmptyIG)); + } + + for (auto& output_param : this->OutputNames()) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param)); + } + + grad->SetAttrMap(this->Attrs()); + + return std::unique_ptr(grad); + } + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class EmptyGradOpMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + std::vector> operator()() const override { + return {}; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb2d740d8609210064a06cccc5d45c84275e9709 --- /dev/null +++ b/paddle/fluid/framework/init.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include // for strdup +#include +#include +#include + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/string/piece.h" + +namespace paddle { +namespace framework { + +std::once_flag gflags_init_flag; + +void InitGflags(std::vector &argv) { + std::call_once(gflags_init_flag, [&]() { + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + +void InitDevices() { + /*Init all avaiable devices by default */ + + std::vector places; + places.emplace_back(platform::CPUPlace()); + int count = 0; + +#ifdef PADDLE_WITH_CUDA + try { + count = platform::GetCUDADeviceCount(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; + } +#else + LOG(WARNING) + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; +#endif + + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } + + platform::DeviceContextPool::Init(places); +} + +void InitGLOG(const std::string &prog_name) { + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. + google::InitGoogleLogging(strdup(prog_name.c_str())); + google::InstallFailureSignalHandler(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/init.h b/paddle/fluid/framework/init.h similarity index 100% rename from paddle/framework/init.h rename to paddle/fluid/framework/init.h diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3018541e27a403e1b6e63a8da9eeb1f67915e9e --- /dev/null +++ b/paddle/fluid/framework/init_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(InitDevices, CPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifndef PADDLE_WITH_CUDA + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U); +#endif +} + +TEST(InitDevices, CUDA) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_CUDA + int count = paddle::platform::GetCUDADeviceCount(); + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} diff --git a/paddle/framework/library_type.h b/paddle/fluid/framework/library_type.h similarity index 100% rename from paddle/framework/library_type.h rename to paddle/fluid/framework/library_type.h diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..31c87492349bff4cd81b101a0e8d44b0516bac46 --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" + +namespace paddle { +namespace framework { +void LoDRankTable::Reset(const LoD& lod, size_t level) { + this->coarse_lod_.clear(); + this->items_.clear(); + PADDLE_ENFORCE(level < lod.size(), + "Cannot rank lod since the level %d is less than lod size %d", + level, lod.size()); + coarse_lod_.reserve(level); + for (size_t i = 0; i < level; ++i) { + coarse_lod_.push_back(lod[i]); + } + auto& vec = lod[level]; + for (size_t i = 0; i < vec.size() - 1; ++i) { + TableItem item; + item.index = i; + item.length = vec[i + 1] - vec[i]; + VLOG(10) << "Add item to rank table " << item.index << " " << item.length; + items_.emplace_back(item); + } + // NOTE(yuyang18): + // + // The time complexity of stable_sort is O(N*log(N)) if additional memory is + // available. It is easy to debug and unit test when using `stable_sort` + // instead of `sort`. Also, the items of a rank table will not be too large. 
+ std::stable_sort(items_.begin(), items_.end(), + [](const TableItem& a, const TableItem& b) { + return a.length > b.length; + }); +} + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h new file mode 100644 index 0000000000000000000000000000000000000000..0eaaf49e4c4d90250b247edf2a8699b8c7c5920d --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { + +// LoD Rank Table stores the `level` of `lod` which is ordered by sequence +// length in descending order. It is useful when implement dynamic RNN and is +// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +// output operators. +// +// The table item contains two element. The length of sequence and the index of +// sequence in that level. +// +// LoDRankTable also stores the coarse_lod, which is the lod information whose +// level is less than input level, in order to restore the output LoD +// information. +class LoDRankTable { + public: + struct TableItem { + size_t index; + size_t length; + }; + + LoDRankTable() {} + + void Reset(const LoD& lod, size_t level); + + const std::vector& items() const { return this->items_; } + + const LoD& coarse_lod() const { return this->coarse_lod_; } + + size_t level() const { return coarse_lod_.size(); } + + private: + LoD coarse_lod_; + std::vector items_; +}; + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..05c67e453d0b8c84aaa8b72ec314153791c73e8f --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.cc @@ -0,0 +1,378 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" + +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +std::ostream &operator<<(std::ostream &os, const LoD &lod) { + os << "{"; + for (auto &v : lod) { + os << "{"; + for (auto &i : v) { + os << i << ","; + } + os << "}"; + } + os << "}"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + if (!platform::is_cpu_place(t.place())) { + LoDTensor tt; + framework::Copy(t, platform::CPUPlace(), &tt); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << tt; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; + } + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); +} + +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, + size_t elem_end) { + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); + } + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto &ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD &in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +bool operator==(const LoD &a, const LoD &b) { + if (a.size() != b.size()) { + return false; + } + + for (size_t i = 0; i < a.size(); i++) { + const auto &a_level = a[i]; + const auto &b_level = b[i]; + if (a_level.size() != b_level.size()) { + return false; + } + for (size_t j = 0; j < a_level.size(); j++) { + if (a_level[j] != b_level[j]) { + return false; + } + } + } + return true; +} + +bool CheckLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + // check: the first offset(the begin offset) of each level should be 0. + if (level.front() != 0) return false; + // check: all the offsets in a level should be ascending(no same items + // allows). 
+ if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + LOG(INFO) << "ascending error"; + return false; + } + } + // check: the lowest level's last offset should equals `tensor_height` if + // tensor_height>0. + if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) + return false; + + // check: the higher level's last offset should equals the lower level's + // size-1. + // NOTE LoD store the levels from top to bottom, so the higher level goes + // first. + for (size_t level = 0; level < in.size() - 1; level++) { + if (in[level].back() != in[level + 1].size() - 1) return false; + } + return true; +} + +bool CheckAbsLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: all the offsets in a level should be ascending(no same items + // allows). + if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + return false; + } + + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + + // check: the first offset of each level should be 0, and the last should be + // the same(the height of underlying tensor). + if (level.front() != 0) return false; + if (tensor_height < 0) { + tensor_height = level.back(); + } else if ((size_t)tensor_height != level.back()) { + return false; + } + } + return true; +} + +using LoDAndOffset = std::pair>; +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, + size_t end_idx, size_t start_level) { + LoD sub_lod; + + for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { + PADDLE_ENFORCE_LE(start_idx, end_idx); + PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size()); + std::vector level_lens; + for (size_t i = start_idx; i < end_idx; ++i) { + level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); + } + sub_lod.emplace_back(level_lens); + start_idx = lod[level_idx][start_idx]; + end_idx = lod[level_idx][end_idx]; + } + + return LoDAndOffset{sub_lod, {start_idx, end_idx}}; +} + +void AppendLoD(LoD *lod, const LoD &lod_length) { + PADDLE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + "The lod_length should has the same size with the appended lod."); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } + *lod = LoD(lod_length.size(), std::vector({0})); + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (size_t len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + { // the 1st field, uint32_t version for LoDTensor + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... 
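// Worked illustration of the layout described above (assuming a 64-bit build):
// for lod = {{0, 2, 5}} the writer emits lod_level = 1, then the level's byte
// size = 3 * sizeof(size_t) = 24, then the three offsets 0, 2, 5 as raw
// size_t values.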
+ auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } + // the 3st field, Tensor + SerializeToStream(os, static_cast(tensor), dev_ctx); +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + DeserializeFromStream(is, static_cast(tensor), dev_ctx); +} + +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + int batch_size = + lod().empty() ? dims()[0] : static_cast(lod()[0].size()) - 1; + size_t result_size = std::min(static_cast(batch_size), places.size()); + size_t remainder = batch_size % places.size(); + + std::vector results; + results.reserve(result_size); + + int step_width = static_cast(batch_size / result_size); + for (size_t i = 0; i < result_size; ++i) { + int begin = static_cast(i * step_width); + int end = static_cast((i + 1) * step_width); + if (i + 1 == places.size()) { // last + end += remainder; + } + + LoDTensor dst; + if (lod().empty()) { + auto src = Slice(begin, end); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + } else { + auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0); + + auto &offset = lod_and_offset.second; + auto src = Slice(offset.first, offset.second); + auto &dst_place = places[i]; + framework::Copy(src, dst_place, &dst); + + LoD my_lod; + for (auto &l : lod_and_offset.first) { + std::vector v{0}; + for (auto &ll : l) { + v.push_back(ll + v.back()); + } + my_lod.emplace_back(v); + } + dst.set_lod(my_lod); + } + results.emplace_back(dst); + } + + return results; +} + +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + framework::DataLayout new_layout = lod_tensors[0]->layout(); + LoD new_lod = lod_tensors[0]->lod(); + for (size_t i = 1; i < lod_tensors.size(); ++i) { + auto *t = lod_tensors[i]; + PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); + PADDLE_ENFORCE_EQ(new_layout, t->layout()); + + PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], + framework::product(t->dims()) / t->dims()[0]); + new_dim[0] += t->dims()[0]; + + auto &lod = t->lod(); + for (size_t j = 0; j < lod.size(); ++j) { + auto &sub_lod = new_lod[j]; + auto &offset = sub_lod.back(); + for (size_t k = 1; k < lod[j].size(); ++k) { + sub_lod.push_back(lod[j][k] + offset); + } + } + } + Resize(new_dim); + set_layout(new_layout); + set_lod(new_lod); + mutable_data(dst_place, new_type); + + int begin = 0; + for (auto 
*src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::Copy(*src, dst_place, &dst); + begin = end; + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..1509a9fb1347659f7526c6892f632feb8c84579c --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.h @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +/* + * LoD is short for Level of Details. + * + * - in a level, each element indicates relative offset of the lower level + * - the first element should be 0 and that indicates that this sequence start + * from 0 + * - each sequence's begin and end(no-inclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 + */ +using LoD = std::vector>; + +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); + +std::string LoDToString(const LoD& lod); + +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, + size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); + +bool operator==(const LoD& a, const LoD& b); + +/* + * Check whether this lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * + * 1. all the offsets in a level should be ascending(no same items allows). + * 2. there should be more than 2 offsets existing in each level. + * 3. the higher level's last offset should equals the lower level's size-1. + * 4. the first offset(the begin offset) of each level should be 0. + * 5. the lowest level's last offset should equals `tensor_height` if + * tensor_height>0. + */ + +bool CheckLoD(const LoD& in, int tensor_height = -1); +/* + * Check whether this absolute lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * 1. all the offsets in a level should be ascending(no same items allows) + * 2. there should be more than 2 offsets existing in each level. + * 3. the first offset of each level should be 0, and the last should be the + * same(the height of underlying tensor) or `tensor_height` if + * tensor_height>0. + */ +bool CheckAbsLoD(const LoD& in, int tensor_height = -1); + +/* + * LoDTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. 
+ */ +class LoDTensor : public Tensor { + public: + LoDTensor() : Tensor() {} + + /* Constructor with place should only be used in pybind */ + explicit LoDTensor(const platform::Place& place) : Tensor(place) {} + + explicit LoDTensor(const LoD& lod) : lod_(lod) {} + + void set_lod(const LoD& lod) { lod_ = lod; } + + const LoD& lod() const { return lod_; } + + LoD* mutable_lod() { return &lod_; } + + /* + * Get the start offset and end offset of an element from LoD. + */ + std::pair lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + PADDLE_ENFORCE_LT(elem, NumElements(level)); + return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); + } + + /* + * Number of LoDTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { return lod_.size(); } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + // the last offset is the end of last element + return (lod_)[level].size() - 1; + } + + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + + private: + LoD lod_; +}; + +/* + * Expand the `source` to fit the LoD of `lod`. For example, a `source` + * LoDTensor is + * - LoD: [0, 2] + * - tensor: [a0, a1] + * a `lod` is + * - LoD: [0 3 5] + * returns a new LoDTensor + * - [a0 a0 a0 a1 a1] + */ +template +LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, + const platform::Place& place) { + LoD abs_lod = ToAbsOffset(lod); + const auto& lod_level = lod[level]; + size_t num_instances = source.dims()[0]; + + // new tensor + LoDTensor tensor; + tensor.set_lod(lod); + auto dims = source.dims(); + dims[0] = lod_level.back(); + tensor.Resize(dims); + tensor.mutable_data(place); + + PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); + for (size_t ins = 0; ins < num_instances; ins++) { + for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { + auto slice = tensor.Slice(elem, elem + 1); + Copy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); + } + } + return tensor; +} + +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every levels(i.e., [start_level: ]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} +std::pair> GetSubLoDAndAbsoluteOffset( + const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); + +void AppendLoD(LoD* lod, const LoD& lod_length); + +/* + * Serialize/Desiralize LoDTensor to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. 
+ */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.md b/paddle/fluid/framework/lod_tensor.md similarity index 100% rename from paddle/framework/lod_tensor.md rename to paddle/fluid/framework/lod_tensor.md diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h new file mode 100644 index 0000000000000000000000000000000000000000..652513bd22597000e8249eb19776182d850793aa --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using LoDTensorArray = std::vector; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e0ed2495d68a8cc0d377aaf5b5103aea1064688 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
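A hedged round-trip sketch for the SerializeToStream/DeserializeFromStream pair declared above, using the in-memory stringstream route mentioned in the header comment; the tensor shape, LoD and values are illustrative only:

#include <sstream>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"

void SerializeRoundTrip() {
  namespace fw = paddle::framework;
  fw::LoDTensor src;
  src.set_lod(fw::LoD{{0, 2, 5}});
  src.Resize({5, 1});
  float* data = src.mutable_data<float>(paddle::platform::CPUPlace());
  for (int i = 0; i < 5; ++i) data[i] = static_cast<float>(i);

  paddle::platform::CPUDeviceContext ctx;
  std::ostringstream os;
  fw::SerializeToStream(os, src, ctx);  // version 0, then the LoD, then the tensor

  std::istringstream is(os.str());
  fw::LoDTensor dst;
  fw::DeserializeFromStream(is, &dst, ctx);  // restores lod() and data on the CPU
}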
+ +#include "paddle/fluid/framework/lod_tensor.h" + +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(LoD, data) { + LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i); + } +} + +TEST(LodExpand, test) { + LoD lod{{0, 2}}; + LoDTensor tensor; + tensor.set_lod(lod); + tensor.Resize({2, 1}); + tensor.mutable_data(platform::CPUPlace()); + tensor.data()[0] = 0; + tensor.data()[1] = 1; + + LoD target; + target.emplace_back(std::vector{0, 3, 5}); + auto new_tensor = LodExpand(tensor, target, 0UL, platform::CPUPlace()); + std::vector result{{0, 0, 0, 1, 1}}; + for (size_t i = 0; i < 5; i++) { + ASSERT_EQ(new_tensor.data()[i], result[i]); + } +} + +TEST(LoD, GetFineGrainedLoDLength) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5})); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + lod.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29})); + + auto lod_and_offset = + paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0); + LoD lod_length = lod_and_offset.first; + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + LoD expected; + expected.push_back(std::vector{2}); + expected.push_back(std::vector{2, 2}); + expected.push_back(std::vector{2, 3, 4, 2}); + EXPECT_EQ(lod_length, expected); + EXPECT_EQ(start_offset, 15UL); + EXPECT_EQ(end_offset, 26UL); +} + +TEST(LoD, AppendLoD) { + LoD lod_lens; + lod_lens.push_back(std::vector({2})); + lod_lens.push_back(std::vector({2, 2})); + lod_lens.push_back(std::vector({2, 3, 4, 2})); + + LoD origin; + origin.push_back(std::vector({0, 2})); + origin.push_back(std::vector({0, 1, 6})); + origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15})); + + paddle::framework::AppendLoD(&origin, lod_lens); + + LoD expected; + expected.push_back(std::vector({0, 2, 4})); + expected.push_back(std::vector({0, 1, 6, 8, 10})); + expected.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26})); + EXPECT_EQ(origin, expected); +} + +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + +TEST(LoD, SplitLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + LoDTensor lod_tensor; + lod_tensor.Resize({20, 1}); + float* dst_ptr = lod_tensor.mutable_data(place); + for (int i = 0; i < lod_tensor.numel(); ++i) { + dst_ptr[i] = i; + } + lod_tensor.set_lod(lod); + + std::vector places{platform::CPUPlace(), + platform::CPUPlace()}; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + + auto lods = lod_tensor.SplitLoDTensor(places); + EXPECT_EQ(lods[0].lod(), lod0); + EXPECT_EQ(lods[1].lod(), lod1); +} + +TEST(LoD, MergeLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace 
place; + + LoDTensor lod_tensor0; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + lod_tensor0.set_lod(lod0); + + lod_tensor0.Resize({13, 1}); + float* dst_ptr = lod_tensor0.mutable_data(place); + for (int i = 0; i < lod_tensor0.numel(); ++i) { + dst_ptr[i] = i; + } + + LoDTensor lod_tensor1; + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + lod_tensor1.set_lod(lod1); + lod_tensor1.Resize({7, 1}); + dst_ptr = lod_tensor1.mutable_data(place); + for (int i = 0; i < lod_tensor1.numel(); ++i) { + dst_ptr[i] = i; + } + + std::vector lods{&lod_tensor0, &lod_tensor1}; + + LoDTensor lod_tensor; + lod_tensor.MergeLoDTensor(lods, place); + EXPECT_EQ(lod_tensor.lod(), lod); +} + +TEST(LoD, CheckLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + // check compatible + ASSERT_TRUE(CheckLoD(relative_lod)); + relative_lod[1].back()++; + ASSERT_FALSE(CheckLoD(relative_lod)); + relative_lod[1].back()--; // recover it + + // check empty + LoD empty_lod; + ASSERT_TRUE(CheckLoD(empty_lod)); + + // check less than 2 offsets in a level + LoD some_lod0; + some_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckLoD(some_lod0)); + + // check with underlying tensor storage. + ASSERT_TRUE(CheckLoD(relative_lod, 5)); + ASSERT_FALSE(CheckLoD(relative_lod, 9)); +} + +TEST(LoD, CheckAbsLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + auto abs_lod = ToAbsOffset(relative_lod); + + ASSERT_TRUE(CheckAbsLoD(abs_lod)); + + // check less than 2 offsets in a level. + + // check the last item should be compatible with tensor height. + abs_lod.back().back()++; + ASSERT_FALSE(CheckAbsLoD(abs_lod)); + abs_lod.back().back()--; // restore + + // check less than 2 offsets in a lod. + LoD abs_lod0; + abs_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckAbsLoD(abs_lod0)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..4dd7810c1b25cbfeb7d6d79034a97db3f1d67ebb --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/place.h" + +__global__ void test(size_t* a, int size) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; + i += blockDim.x * gridDim.x) { + a[i] *= 2; + } +} + +TEST(LoD, data) { + paddle::framework::InitDevices(); + + paddle::framework::LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + cudaDeviceSynchronize(); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i * 2); + } +} + +TEST(LoDTensor, LoDInGPU) { + paddle::framework::InitDevices(); + + paddle::framework::LoDTensor lod_tensor; + paddle::platform::CUDAPlace place(0); + + paddle::framework::LoD src_lod; + src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); + + lod_tensor.Resize({14, 16}); + lod_tensor.mutable_data(place); + + lod_tensor.set_lod(src_lod); + EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL); + EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); + + auto lod = lod_tensor.lod(); + + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + cudaDeviceSynchronize(); + + for (size_t i = 0; i < src_lod[0].size(); ++i) { + EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); + } +} diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9756754260d46519d181f95e000f39ba92d22ef0 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector.h @@ -0,0 +1,363 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "glog/logging.h" + +namespace paddle { +namespace framework { + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class Vector { + public: + using value_type = T; + + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + if (count == 0) { + InitEmpty(); + } else { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. 
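
Before the remaining constructors and accessors of this Vector<T>, a short usage sketch: the point of the class is that CPU writes and GPU reads can be interleaved and the device copy happens lazily on access. The sketch assumes a CUDA build with device 0 available; it is not part of the patch.

// Usage sketch for the mixed Vector (assumptions: CUDA build, device 0, devices initialized).
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/platform/place.h"

void MixedVectorSketch() {
  paddle::framework::Vector<int> v;
  for (int i = 0; i < 4; ++i) {
    v.push_back(i);                    // CPU write: data lives on CPU and is marked dirty
  }
  paddle::platform::CUDAPlace gpu(0);
  const int* d_ptr = v.CUDAData(gpu);  // first GPU access copies the dirty CPU data over
  (void)d_ptr;                         // in real code d_ptr would be passed to a kernel
  v[0] = 42;                           // mutable CPU access marks the data dirty again
  d_ptr = v.CUDAData(gpu);             // the next GPU access re-synchronizes
}
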
+ template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return &this->operator[](0); } + + T* end() { return &this->operator[](size()); } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { return &this->operator[](0); } + const T* end() const { return &this->operator[](size()); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } + + // resize the vector + void resize(size_t size) { + if (size + 1 < capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } + + // get cuda ptr. immutable + const T* CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); + } + + // get cuda ptr. 
mutable + T* CUDAMutableData(platform::Place place) { + const T* ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } + + // the unify method to access CPU or CUDA data. immutable. + const T* Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T* MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } + + private: + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } + + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. 
+ CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..a89064525661af71b22f18f835fd7b111956847b --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc new file mode 100644 index 
0000000000000000000000000000000000000000..cbc15e60b83397ed8420bc7a4cd716ef15979554 --- /dev/null +++ b/paddle/fluid/framework/op_desc.cc @@ -0,0 +1,521 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_desc.h" +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { + +class OpDesc; +class BlockDesc; +class CompileTimeInferShapeContext : public InferShapeContext { + public: + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + + bool HasInput(const std::string &name) const override; + + bool HasOutput(const std::string &name) const override; + + bool HasInputs(const std::string &name) const override; + + bool HasOutputs(const std::string &name) const override; + + AttrReader Attrs() const override; + + const std::vector &Inputs( + const std::string &name) const override; + + const std::vector &Outputs( + const std::string &name) const override; + + void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) { + VLOG(3) << "input " << in << " is not LodTensor"; + return; + } + PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, + "The %d-th output of Output(%s) must be LoDTensor.", j, + out); + out_var->SetLoDLevel(in_var->GetLoDLevel()); + } + + bool IsRuntime() const override; + + protected: + proto::VarDesc::VarType GetVarType(const std::string &name) const override; + + DDim GetDim(const std::string &name) const override; + + void SetDim(const std::string &name, const DDim &dim) override; + + std::vector GetRepeatedDims(const std::string &name) const override; + + void SetRepeatedDims(const std::string &name, + const std::vector &dims) override; + + InferShapeVarPtr GetVarPtr(const std::string &name) override; + + const OpDesc &op_; + const BlockDesc &block_; +}; + +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { + desc_.set_type(type); + inputs_ = inputs; + outputs_ = outputs; + attrs_ = attrs; + need_update_ = true; +} + +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) + : desc_(desc), need_update_(false) { + // restore inputs_ + int input_size = desc_.inputs_size(); + for (int i = 0; i < input_size; ++i) { + const 
proto::OpDesc::Var &var = desc_.inputs(i); + std::vector &args = inputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore outputs_ + int output_size = desc_.outputs_size(); + for (int i = 0; i < output_size; ++i) { + const proto::OpDesc::Var &var = desc_.outputs(i); + std::vector &args = outputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore attrs_ + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { + std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting BLOCK attr here. + if (attr.type() != proto::AttrType::BLOCK) { + attrs_[attr_name] = GetAttrValue(attr); + } + } + this->block_ = block; +} + +proto::OpDesc *OpDesc::Proto() { + Flush(); + return &desc_; +} + +const std::vector &OpDesc::Input(const std::string &name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, + Type()); + return it->second; +} + +std::vector OpDesc::InputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->inputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + inputs_[param_name] = args; +} + +const std::vector &OpDesc::Output(const std::string &name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", + name, Type()); + return it->second; +} + +std::vector OpDesc::OutputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->outputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + this->outputs_[param_name] = args; +} + +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return static_cast(it->second.which() - 1); +} + +std::vector OpDesc::AttrNames() const { + std::vector retv; + retv.reserve(attrs_.size()); + for (auto &attr : attrs_) { + retv.push_back(attr.first); + } + return retv; +} + +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { + this->attrs_[name] = v; + need_update_ = true; +} + +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { + this->attrs_[name] = █ + need_update_ = true; +} + +void OpDesc::SetAttrMap( + const std::unordered_map &attr_map) { + attrs_ = attr_map; + need_update_ = true; +} + +Attribute OpDesc::GetAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return it->second; +} + +int OpDesc::GetBlockAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return boost::get(it->second)->ID(); +} + +const std::unordered_map &OpDesc::GetAttrMap() const { + return attrs_; +} + +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), 
input.second.end(), old_name, new_name); + } + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + need_update_ = true; +} + +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + need_update_ = true; +} + +struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string &v) const { attr_->set_s(v); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template ::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + void operator()(int64_t v) const { attr_->set_l(v); } + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } +}; + +void OpDesc::Flush() { + if (need_update_) { + this->desc_.mutable_inputs()->Clear(); + for (auto &ipt : inputs_) { + auto *input = desc_.add_inputs(); + input->set_parameter(ipt.first); + VectorToRepeated(ipt.second, input->mutable_arguments()); + } + + this->desc_.mutable_outputs()->Clear(); + for (auto &opt : outputs_) { + auto *output = desc_.add_outputs(); + output->set_parameter(opt.first); + VectorToRepeated(opt.second, output->mutable_arguments()); + } + + this->desc_.mutable_attrs()->Clear(); + for (auto &attr : attrs_) { + auto *attr_desc = desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.which() - 1)); + SetAttrDescVisitor visitor(attr_desc); + boost::apply_visitor(visitor, attr.second); + } + + need_update_ = false; + } +} + +static std::once_flag init_infer_shape_funcs; + +static void InitInferShapeFuncs() { + std::call_once(init_infer_shape_funcs, [] { + auto &map = OpInfoMap::Instance(); + auto &info_map = *map.mutable_map(); + + for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { + auto op_type = kern_pair.first; + auto &op_info = info_map.at(op_type); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); + if (op_info.infer_shape_) { // infer_shape has been registered. + continue; + } + op_info.infer_shape_ = [op](InferShapeContext *ctx) { + op->InferShape(ctx); + }; + } + }); +} + +void OpDesc::CheckAttrs() { + PADDLE_ENFORCE(!Type().empty(), + "CheckAttr() can not be called before type is setted."); + auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); + if (checker == nullptr) { + // checker is not configured. That operator could be generated by Paddle, + // not by users. 
+ return; + } + checker->Check(attrs_); +} + +void OpDesc::InferShape(const BlockDesc &block) const { + VLOG(3) << "CompileTime infer shape on " << Type(); + InitInferShapeFuncs(); + auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; + PADDLE_ENFORCE(static_cast(infer_shape), + "%s's infer_shape has not been registered", this->Type()); + CompileTimeInferShapeContext ctx(*this, block); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + auto inames = this->InputArgumentNames(); + sout << " From ["; + std::copy(inames.begin(), inames.end(), + std::ostream_iterator(sout, ", ")); + sout << "] to ["; + auto onames = this->OutputArgumentNames(); + std::copy(onames.begin(), onames.end(), + std::ostream_iterator(sout, ", ")); + sout << "]"; + VLOG(10) << sout.str(); + } + infer_shape(&ctx); +} + +void OpDesc::InferVarType(BlockDesc *block) const { + auto &info = OpInfoMap::Instance().Get(this->Type()); + if (info.infer_var_type_) { + info.infer_var_type_(*this, block); + } else { + // all output type is LoDTensor by default + VLOG(10) << this->Type() + << " has not registered InferVarType. Set output variables to " + "LOD_TENSOR"; + for (auto &out_pair : this->outputs_) { + for (auto &out_var_name : out_pair.second) { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(proto::VarDesc::LOD_TENSOR); + } + } + } +} + +CompileTimeInferShapeContext::CompileTimeInferShapeContext( + const OpDesc &op, const BlockDesc &block) + : op_(op), block_(block) {} + +bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + auto length = input_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(input_names[0]); +} + +bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + auto length = output_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(output_names[0]); +} + +bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + if (input_names.empty()) { + return false; + } + for (auto &input : input_names) { + if (!block_.HasVarRecursive(input)) return false; + } + return true; +} + +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + if (output_names.empty()) { + return false; + } + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; +} + +AttrReader CompileTimeInferShapeContext::Attrs() const { + return AttrReader(op_.GetAttrMap()); +} + +const std::vector &CompileTimeInferShapeContext::Inputs( + const std::string &name) const { + return op_.Input(name); +} + +const std::vector &CompileTimeInferShapeContext::Outputs( + const std::string &name) const { + return op_.Output(name); +} + +DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; + try { + auto shape = var->GetShape(); + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); + } catch (...) 
{ + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +std::vector CompileTimeInferShapeContext::GetRepeatedDims( + const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector res; + try { + auto shapes = var->GetShapes(); + for (const auto &s : shapes) { + res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); + } + } catch (...) { + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +void CompileTimeInferShapeContext::SetDim(const std::string &name, + const DDim &dim) { + block_.FindVarRecursive(name)->SetShape(vectorize(dim)); +} + +void CompileTimeInferShapeContext::SetRepeatedDims( + const std::string &name, const std::vector &dims) { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector> dim_vec(dims.size()); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + var->SetShapes(dim_vec); +} + +bool CompileTimeInferShapeContext::IsRuntime() const { return false; } + +proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( + const std::string &name) const { + return block_.FindVarRecursive(name)->GetType(); +} + +InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( + const std::string &name) { + return block_.FindVarRecursive(name); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..698df829e56e1182e742db926a712497ee2b6966 --- /dev/null +++ b/paddle/fluid/framework/op_desc.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { + +class BlockDesc; +class ProgramDesc; +class OpDesc { + public: + OpDesc() {} + + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); + + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block); + + explicit OpDesc(BlockDesc *block) : block_(block) {} + + OpDesc(const OpDesc &other, BlockDesc *block) { + *this = other; + block_ = block; + } + + void CopyFrom(const OpDesc &op_desc); + + proto::OpDesc *Proto(); + + std::string Type() const { return desc_.type(); } + + void SetType(const std::string &type) { desc_.set_type(type); } + + const std::vector &Input(const std::string &name) const; + + std::vector InputArgumentNames() const; + + void SetInput(const std::string ¶m_name, + const std::vector &args); + + const std::vector &Output(const std::string &name) const; + + std::vector OutputArgumentNames() const; + + void SetOutput(const std::string ¶m_name, + const std::vector &args); + + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + proto::AttrType GetAttrType(const std::string &name) const; + + std::vector AttrNames() const; + + void SetAttr(const std::string &name, const Attribute &v); + + void SetBlockAttr(const std::string &name, BlockDesc &block); + + Attribute GetAttr(const std::string &name) const; + + int GetBlockAttr(const std::string &name) const; + + void Rename(const std::string &old_name, const std::string &new_name); + + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + + // Only be used in C++ + const AttributeMap &GetAttrMap() const; + + // Only be used in C++ + void SetAttrMap(const AttributeMap &attr_map); + + std::vector InputNames() const { return MapKeys(inputs_); } + std::vector OutputNames() const { return MapKeys(outputs_); } + + void SetInputMap(const VariableNameMap &input) { + this->inputs_ = input; + this->need_update_ = true; + } + + void SetOutputMap(const VariableNameMap &output) { + this->outputs_ = output; + this->need_update_ = true; + } + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + AttributeMap *MutableAttrMap() { + this->need_update_ = true; + return &this->attrs_; + } + + void CheckAttrs(); + + void InferShape(const BlockDesc &block) const; + + void InferVarType(BlockDesc *block) const; + + void MarkAsTarget() { desc_.set_is_target(true); } + + void Flush(); + + BlockDesc *Block() { return this->block_; } + + void SetBlock(BlockDesc *block) { this->block_ = block; } + + private: + template + static std::vector MapKeys(const MapType &map) { + std::vector ret_val; + ret_val.reserve(map.size()); + std::transform( + map.begin(), map.end(), std::back_inserter(ret_val), + [](const typename MapType::value_type &pair) { return pair.first; }); + return ret_val; + } + + proto::OpDesc desc_; + BlockDesc *block_; // not_own + // input arg name => input variable names + VariableNameMap inputs_; + // output arg name => output variable names + VariableNameMap outputs_; + AttributeMap attrs_; + + // need_update_ indicate there some local changes not be synchronized. 
If + // local changes should be synchronized, need_update_ should be set to true. + bool need_update_{false}; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..703c9c3234b62e80c3f768ddb892584c1c0070c0 --- /dev/null +++ b/paddle/fluid/framework/op_info.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_info.h" + +namespace paddle { +namespace framework { + +static OpInfoMap* g_op_info_map = nullptr; + +OpInfoMap& OpInfoMap::Instance() { + if (g_op_info_map == nullptr) { + g_op_info_map = new OpInfoMap(); + } + return *g_op_info_map; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h new file mode 100644 index 0000000000000000000000000000000000000000..e6b3ff9e653196b9234e02131f37d5964c4f6e84 --- /dev/null +++ b/paddle/fluid/framework/op_info.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class InferShapeBase { + public: + virtual ~InferShapeBase() = default; + virtual void operator()(InferShapeContext*) const = 0; +}; + +struct OpInfo { + OpCreator creator_; + GradOpMakerFN grad_op_maker_; + proto::OpProto* proto_{nullptr}; + OpAttrChecker* checker_{nullptr}; + InferVarTypeFN infer_var_type_; + InferShapeFN infer_shape_; + + bool HasOpProtoAndChecker() const { + return proto_ != nullptr && checker_ != nullptr; + } + + const proto::OpProto& Proto() const { + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE(proto_->IsInitialized(), + "Operator Proto must be initialized in op info"); + return *proto_; + } + + const OpCreator& Creator() const { + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); + return creator_; + } + + const GradOpMakerFN& GradOpMaker() const { + PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, + "Operator GradOpMaker has not been registered."); + return grad_op_maker_; + } + + const OpAttrChecker* Checker() const { return checker_; } +}; + +class OpInfoMap { + public: + static OpInfoMap& Instance(); + + bool Has(const std::string& op_type) const { + return map_.find(op_type) != map_.end(); + } + + void Insert(const std::string& type, const OpInfo& info) { + PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + map_.insert({type, info}); + } + + const OpInfo& Get(const std::string& type) const { + auto op_info_ptr = GetNullable(type); + PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered", + type); + return *op_info_ptr; + } + + const OpInfo* GetNullable(const std::string& type) const { + auto it = map_.find(type); + if (it == map_.end()) { + return nullptr; + } else { + return &it->second; + } + } + + const std::unordered_map& map() const { return map_; } + + std::unordered_map* mutable_map() { return &map_; } + + private: + OpInfoMap() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(OpInfoMap); +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h new file mode 100644 index 0000000000000000000000000000000000000000..b5dbff26d7edc212a270d4d187dbb868068790c9 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/library_type.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpKernelType { + struct Hash { + size_t operator()(const OpKernelType& key) const { + int place = key.place_.which(); + int data_type = static_cast(key.data_type_) << LEFT_SHIFT; + int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); + int library_type = static_cast(key.library_type_) + << (LEFT_SHIFT * 3); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type); + } + }; + + // place, data_type, library_type kinds less than 2^8 + constexpr static int LEFT_SHIFT = 8; + + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + + OpKernelType(proto::DataType data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type) {} + + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type) {} + + bool operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_; + } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + +inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { + return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r; +} + +inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || + NeedTransformLayout(l.data_layout_, r.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..64096907df5a52904525ef0bf25bb9527c3a8c4b --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), + "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a779b10b49ab35dd0dbe25ac3f2bccd34fb654e --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { + +void OpProtoAndCheckerMaker::Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( + const std::string& name, const std::string& comment) { + auto* input = proto_->add_inputs(); + input->set_name(name); + input->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{input}; +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( + const std::string& name, const std::string& comment) { + auto* output = proto_->add_outputs(); + output->set_name(name); + output->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{output}; +} + +void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h new file mode 100644 index 0000000000000000000000000000000000000000..1dbfc7d37be6ae79fde39434b12355a54ee648f6 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// this class not only make proto but also init attribute checkers. 
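
As a usage sketch of the class defined next, here is how a concrete operator could describe itself; the op name, inputs, and attribute are hypothetical, and a registrar would normally construct the maker and call Validate() afterwards.

// Hypothetical maker for a "my_scale" op; illustrative only.
#include "paddle/fluid/framework/op_proto_maker.h"

class MyScaleOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
  MyScaleOpMaker(paddle::framework::proto::OpProto* proto,
                 paddle::framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) input of the hypothetical my_scale op");
    AddOutput("Out", "(Tensor) Out = scale * X");
    AddAttr<float>("scale", "the multiplier applied to X")
        .SetDefault(1.0f)
        .GreaterThan(0.0f);
    AddComment("Scales X by a positive constant; illustrative only.");
  }
};
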
+class OpProtoAndCheckerMaker { + public: + using OpProto = proto::OpProto; + using OpAttrChecker = framework::OpAttrChecker; + OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : proto_(proto), op_checker_(op_checker) {} + + virtual ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate(); + + protected: + struct VariableBuilder { + OpProto::Var* var_; + + VariableBuilder& AsDuplicable() { + var_->set_duplicable(true); + return *this; + } + + VariableBuilder& AsIntermediate() { + var_->set_intermediate(true); + return *this; + } + + VariableBuilder& AsDispensable() { + var_->set_dispensable(true); + return *this; + } + }; + + VariableBuilder AddInput(const std::string& name, const std::string& comment); + + VariableBuilder AddOutput(const std::string& name, + const std::string& comment); + + template + TypedAttrChecker& AddAttr(const std::string& name, + const std::string& comment, + bool generated = false) { + auto* attr = proto_->add_attrs(); + attr->set_name(name); + attr->set_comment(comment); + attr->set_generated(generated); + attr->set_type(AttrTypeID()); + return op_checker_->AddAttrChecker(name); + } + + void AddComment(const std::string& comment) { proto_->set_comment(comment); } + + private: + void CheckNoDuplicatedInOutAttrs(); + + OpProto* proto_; + OpAttrChecker* op_checker_; + bool validated_{false}; +}; + +class NOPMaker : public OpProtoAndCheckerMaker { + public: + NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) {} +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfefee8dbdead9dd0074d954fe7318baae57e8c4 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_proto_maker.h" + +#include "gtest/gtest.h" + +class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(paddle::framework::proto::OpProto* proto, + paddle::framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..739ec72ebc17e31ab207b0e2260d7f563ceaca6e --- /dev/null +++ b/paddle/fluid/framework/op_registry.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { + +std::unique_ptr OpRegistry::CreateOp( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs) { + auto& info = OpInfoMap::Instance().Get(type); + if (info.Checker() != nullptr) { + info.Checker()->Check(attrs); + } + auto op = info.Creator()(type, inputs, outputs, attrs); + return std::unique_ptr(op); +} + +static VariableNameMap ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& + op_desc_vars) { + VariableNameMap ret_val; + for (auto& var : op_desc_vars) { + auto& var_names = ret_val[var.parameter()]; + auto& var_names_in_proto = var.arguments(); + var_names.reserve(static_cast(var_names_in_proto.size())); + std::copy(var_names_in_proto.begin(), var_names_in_proto.end(), + std::back_inserter(var_names)); + } + return ret_val; +} + +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { + VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. 
Use CreateOp(const OpDesc& op_desc) " + "instead."; + VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } + + return CreateOp(op_desc.type(), inputs, outputs, attrs); +} + +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), + op_desc.GetAttrMap()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..73faa99668ad58ddb66de515eb4750883f58bcf5 --- /dev/null +++ b/paddle/fluid/framework/op_registry.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG() +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/details/op_registry.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros to + // call this method. So, as long as the callee code calls USE_OP, the global + // registrar variable won't be removed by the linker. 
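
A minimal standalone illustration of the retention trick described in the comment above, with demo names only (not Paddle code): a global registrar does its work in a constructor, and an exported Touch-style hook gives user code something to reference so the linker keeps the defining object file. In Paddle the two halves live in different translation units (the REGISTER_OP side and the USE_OP side).

// Demo of the Registrar/Touch pattern in isolation.
#include <iostream>

struct DemoRegistrar {
  DemoRegistrar() { std::cout << "op registered\n"; }  // registration as a side effect
  void Touch() {}                                      // no-op anchor for consumers
};

static DemoRegistrar demo_registrar;  // sits in the op's object file
int TouchDemoRegistrar() {            // analogue of TouchOpRegistrar_##op_type()
  demo_registrar.Touch();
  return 0;
}

// Consumer side (analogue of USE_OP): referencing the hook forces the linker
// to pull in the object file that owns demo_registrar.
static int use_demo_registrar __attribute__((unused)) = TouchDemoRegistrar();

int main() { return use_demo_registrar; }
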
+ void Touch() {} +}; + +template +struct OperatorRegistrar : public Registrar { + explicit OperatorRegistrar(const char* op_type) { + PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), + "'%s' is registered more than once.", op_type); + static_assert(sizeof...(ARGS) != 0, + "OperatorRegistrar should be invoked at least by OpClass"); + OpInfo info; + details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info); + OpInfoMap::Instance().Insert(op_type, info); + } +}; + +class OpRegistry { + public: + static std::unique_ptr CreateOp(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + AttributeMap attrs); + + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); + + static std::unique_ptr CreateOp(const OpDesc& op_desc); +}; + +template +struct OpKernelRegistrarFunctor; + +template +struct OpKernelRegistrarFunctor { + using KERNEL_TYPE = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type) const { + using T = typename KERNEL_TYPE::ELEMENT_TYPE; + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + DataLayout::kAnyLayout, StringToLibraryType(library_type)); + OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); + + constexpr auto size = std::tuple_size>::value; + OpKernelRegistrarFunctor + func; + func(op_type, library_type); + } +}; + +template +struct OpKernelRegistrarFunctor { + void operator()(const char* op_type, const char* library_type) const {} +}; + +// User can register many kernel in one place. The data type could be different. +template +class OpKernelRegistrar : public Registrar { + public: + explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + OpKernelRegistrarFunctor func; + func(op_type, library_type); + } +}; + +/** + * check if MACRO is used in GLOBAL NAMESPACE. + */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ +#define REGISTER_OPERATOR(op_type, op_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, \ + "REGISTER_OPERATOR must be called in global namespace"); \ + class _OpClass_##op_type##_ : public op_class { \ + public: \ + DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ + DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ + }; \ + static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \ + ##__VA_ARGS__> \ + __op_registrar_##op_type##__(#op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ + } + +/** + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with drop_empty_grad=false instead. + */ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. 
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ + op_maker_class); + +#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ + REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \ + ##__VA_ARGS__) + +#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ + REGISTER_OPERATOR(op_type, op_class, op_maker_class) + +/** + * Macro to register OperatorKernel. + */ +#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "REGISTER_OP_KERNEL must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ + #LIBRARY_TYPE); \ + int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ + __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + +/** + * Macro to mark what Operator and Kernel + * we will use and tell the compiler to + * link them into target. + */ +#define USE_OP_ITSELF(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_itself_##op_type, \ + "USE_OP_ITSELF must be called in global namespace"); \ + extern int TouchOpRegistrar_##op_type(); \ + static int use_op_itself_##op_type##_ __attribute__((unused)) = \ + TouchOpRegistrar_##op_type() + +#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + "USE_OP_DEVICE_KERNEL must be in global namespace"); \ + extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ + static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ \ + __attribute__((unused)) = \ + TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() + +// TODO(fengjiayi): The following macros +// seems ugly, do we have better method? + +#ifndef PADDLE_WITH_CUDA +#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) +#else +#define USE_OP_KERNEL(op_type) \ + USE_OP_DEVICE_KERNEL(op_type, CPU); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) +#endif + +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + +#define USE_CPU_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CPU); + +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) + +#define USE_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_KERNEL(op_type) + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfbb2cfc2c57c705cf42c65825edcc6dea08cf41 --- /dev/null +++ b/paddle/fluid/framework/op_registry_test.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace pd = paddle::framework; + +namespace paddle { +namespace framework { + +class CosineOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is cos op"); + } +}; + +class MyTestOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of cosine op").AsDuplicable(); + AddOutput("output", "output of cosine op").AsIntermediate(); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddComment("This is my_test op"); + } +}; +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + var->add_arguments(arg_name); + } +} +REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + +TEST(OpRegistry, CreateOp) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + float scale = 3.3; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(scale); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + float scale_get = op->Attr("scale"); + ASSERT_EQ(scale_get, scale); +} + +TEST(OpRegistry, IllegalAttr) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(-2.0); + + bool 
caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "larger_than check fail"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(OpRegistry, DefaultValue) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + ASSERT_TRUE(op_desc.IsInitialized()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + ASSERT_EQ(op->Attr("scale"), 1.0); +} + +TEST(OpRegistry, CustomChecker) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("my_test_op"); + BuildVar("input", {"ii"}, op_desc.add_inputs()); + BuildVar("output", {"oo"}, op_desc.add_outputs()); + + // attr 'test_attr' is not set + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "Attribute 'test_attr' is required!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to an illegal value + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(3); + caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "'test_attr' must be even!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to a legal value + op_desc.mutable_attrs()->Clear(); + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(4); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + op->Run(scope, cpu_place); + int test_attr = op->Attr("test_attr"); + ASSERT_EQ(test_attr, 4); +} + +class CosineOpComplete : public paddle::framework::CosineOp { + public: + DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp); + DEFINE_OP_CLONE_METHOD(CosineOpComplete); +}; + +TEST(OperatorRegistrar, Test) { + using namespace paddle::framework; + OperatorRegistrar reg("cos"); +} + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment("NoGradOp, same input output. 
no Grad"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + 
paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..61529fe38b15fe2a4bfa0d64159994d6b62fb086 --- /dev/null +++ b/paddle/fluid/framework/operator.cc @@ -0,0 +1,601 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include + +#include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type.h" + +DECLARE_bool(benchmark); + +namespace paddle { +namespace framework { + +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } +} + +static LoD GetLoD(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + return var->Get().lod(); + } else { + return default_lod; + } +} + +std::string OperatorBase::Input(const std::string& name) const { + auto& ins = Inputs(name); + PADDLE_ENFORCE_LE(ins.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + type_, name); + return ins.empty() ? 
kEmptyVarName : ins[0]; +} + +const std::vector& OperatorBase::Inputs( + const std::string& name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.", + type_, name); + return it->second; +} + +std::string OperatorBase::Output(const std::string& name) const { + auto& outs = Outputs(name); + PADDLE_ENFORCE_LE(outs.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + type_, name); + return outs.empty() ? kEmptyVarName : outs[0]; +} + +const std::vector& OperatorBase::Outputs( + const std::string& name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), + "Operator %s does not have an output called %s.", type_, name); + return it->second; +} + +std::string OperatorBase::DebugStringEx(const Scope* scope) const { + std::stringstream ss; + ss << "Op(" << type_ << "), inputs:{"; + for (auto it = inputs_.begin(); it != inputs_.end();) { + auto& input = *it; + ss << input.first << "["; + for (size_t i = 0; i < input.second.size(); ++i) { + ss << input.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "(" << GetLoD(*scope, input.second[i]) << ")"; + } + if (i != input.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != inputs_.end()) { + ss << ", "; + } + } + ss << "}, outputs:{"; + for (auto it = outputs_.begin(); it != outputs_.end();) { + auto& output = *it; + ss << output.first << "["; + for (size_t i = 0; i < output.second.size(); ++i) { + ss << output.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "(" << GetLoD(*scope, output.second[i]) << ")"; + } + if (i != output.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != outputs_.end()) { + ss << ", "; + } + } + ss << "}."; + return ss.str(); +} + +void OperatorBase::Rename(const std::string& old_name, + const std::string& new_name) { + for (auto& input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + for (auto& output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } +} + +OperatorBase::OperatorBase(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) + : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { + GenerateTemporaryNames(); + CheckAllInputOutputSet(); +} + +std::vector OperatorBase::InputVars() const { + std::vector ret_val; + for (auto& o : inputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; +} + +std::vector OperatorBase::OutputVars(bool has_intermediate) const { + std::vector ret_val; + if (has_intermediate) { + // push all outputs into ret_val + for (auto& o : outputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; + } + auto& info = OpInfoMap::Instance().Get(Type()); + + // get all OpProto::Var for outputs + for (auto& o : info.Proto().outputs()) { + // ignore all intermediate output + if (o.intermediate()) continue; + auto out = outputs_.find(o.name()); + if (out != outputs_.end()) { + ret_val.reserve(ret_val.size() + out->second.size()); + ret_val.insert(ret_val.end(), out->second.begin(), out->second.end()); + } + } + return ret_val; +} + +void OperatorBase::CheckAllInputOutputSet() const { + auto& info_map = 
OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) return; + + for (auto& in : op_info->Proto().inputs()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Type %s's input %s is not set", Type(), in.name()); + } + + for (auto& out : op_info->Proto().outputs()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Type %s's output %s is not set", Type(), out.name()); + } +} + +void OperatorBase::GenerateTemporaryNames() { + static std::atomic gUniqId(0UL); + for (auto& output : outputs_) { + for (auto& output_name : output.second) { + if (output_name == kTempVarName) { + output_name += type_; + output_name += "@"; + output_name += std::to_string(gUniqId.fetch_add(1)); + } + } + } +} + +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + +static const Tensor* GetTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +static Tensor* GetMutableTensorFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); + } +} + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr + : GetTensorFromVar(const_cast(var)); +} + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const { + auto names = op().Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : GetTensorFromVar(var); + }); + return res; +} + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : GetMutableTensorFromVar(var); +} + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const { + auto names = op().Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr + : GetMutableTensorFromVar(var); + }); + return res; +} + +bool OpSupportGPU(const std::string& op_type) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operator must support GPU + + return true; + } + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + return false; +} + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + bool HasInput(const std::string& name) const override { + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto ipt = ins[0]; + auto* var = ipt == kEmptyVarName ? 
nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasOutput(const std::string& name) const override { + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto ipt = outs[0]; + auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + return var != nullptr; + } + + bool HasInputs(const std::string& name) const override { + auto inputs = op_.Inputs(name); + if (inputs.empty()) { + return false; + } + for (auto& input : inputs) { + if (scope_.FindVar(input) == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + auto outputs = op_.Outputs(name); + if (outputs.empty()) { + return false; + } + for (auto& output : outputs) { + if (scope_.FindVar(output) == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? 
+ out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); + } + + bool IsRuntime() const override { return true; } + + protected: + DDim GetDim(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + std::vector GetRepeatedDims(const std::string& name) const override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + return var->Get().shapes(); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + void SetDim(const std::string& name, const DDim& dim) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + Variable* var = scope_.FindVar(name); + if (var->IsType()) { + var->GetMutable()->set_shapes(dims); + } else { + PADDLE_THROW( + "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's " + "type_id is %s.", + name, var->Type().name()); + } + } + + proto::VarDesc::VarType GetVarType(const std::string& name) const override { + auto* var = scope_.FindVar(name); + return ToVarType(var->Type()); + } + + InferShapeVarPtr GetVarPtr(const std::string& name) override { + return scope_.FindVar(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; +}; + +void OperatorWithKernel::Run(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + ExecutionContext ctx(*this, scope, *dev_ctx); + + OpKernelMap& kernels = kernels_iter->second; + + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
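+  // One possible shape for that fallback (an illustrative sketch only, not
+  // implemented here): build a key from each (place, library) pair in
+  // kKernelPriority, in order, and use the first one present in `kernels`,
+  // e.g. key = OpKernelType(IndicateDataType(ctx), std::get<0>(candidate),
+  // DataLayout::kAnyLayout, std::get<1>(candidate)); if (kernels.count(key))
+  // run that kernel. The placeholder below marks where that selection would go.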
+ + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = this->GetExpectedKernelType(ctx); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + // do data transform + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + auto* trans_var = new_scope.Var(var_name); + std::shared_ptr out(new Tensor); + DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, + out.get()); + CopyVariableWithTensor(*var, *(out.get()), *trans_var); + } + } + } + } + } + + auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); + kernel_iter->second->Compute( + ExecutionContext(*this, new_scope, *new_dev_ctx)); + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + new_dev_ctx->Wait(); + } +} + +proto::DataType OperatorWithKernel::IndicateDataType( + const ExecutionContext& ctx) const { + auto& scope = ctx.scope(); + int data_type = -1; + for (auto& input : this->inputs_) { + for (auto& ipt_name : input.second) { + auto* var = scope.FindVar(ipt_name); + if (var != nullptr) { + const Tensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &(var->Get().value()); + } + if (t != nullptr) { + int tmp = static_cast(ToDataType(t->type())); + PADDLE_ENFORCE(tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same.", Type()); + data_type = tmp; + } + } + } + } + PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); + return static_cast(data_type); +} + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h new file mode 100644 index 0000000000000000000000000000000000000000..52300abeb7df346d610d2363335dc9d3330ee39e --- /dev/null +++ b/paddle/fluid/framework/operator.h @@ -0,0 +1,401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/utils/Error.h" + +namespace paddle { +namespace framework { + +/// If a variable is a empty variable, that name will be used. +constexpr char kEmptyVarName[] = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +constexpr char kTempVarName[] = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +constexpr char kGradVarSuffix[] = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros. +constexpr char kZeroVarSuffix[] = "@ZERO"; + +// define some kernel priority +/* Define multiple kernel type fallback order*/ +extern std::vector> kKernelPriority; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} + +class OperatorBase; +class ExecutionContext; + +/** + * OperatorBase has the basic element that Net will call to do computation. + * Only CreateOperator from OpRegistry will new Operator directly. User + * should always construct a proto message OpDesc and call + * OpRegistry::CreateOp(op_desc) to get an Operator instance. + */ +class OperatorBase { + public: + OperatorBase(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs); + + virtual ~OperatorBase() {} + + template + inline const T& Attr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + return boost::get(attrs_.at(name)); + } + + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } + + /// Net will call this function to Run an op. + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; + + // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. + virtual void Stop() {} + + virtual bool IsNetOp() const { return false; } + + virtual bool SupportGPU() const { return false; } + + /// rename inputs outputs name + void Rename(const std::string& old_name, const std::string& new_name); + + const VariableNameMap& Inputs() const { return inputs_; } + const VariableNameMap& Outputs() const { return outputs_; } + + //! 
Get a input with argument's name described in `op_proto` + std::string Input(const std::string& name) const; + //! Get a input which has multiple variables. + const std::vector& Inputs(const std::string& name) const; + + std::vector InputVars() const; + + //! Get a output with argument's name described in `op_proto` + std::string Output(const std::string& name) const; + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. + const std::vector& Outputs(const std::string& name) const; + + virtual std::vector OutputVars(bool has_intermediate) const; + + const std::string& Type() const { return type_; } + void SetType(const std::string& type) { type_ = type; } + const AttributeMap& Attrs() const { return attrs_; } + + // Return a new operator instance, which is as same as this. + // Use unique_ptr to prevent caller forget to delete this pointer. + virtual std::unique_ptr Clone() const = 0; + + protected: + std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) + VariableNameMap inputs_; + + // NOTE: in case of OpGrad, outputs_ contains + // IG (Inputs Gradients) + VariableNameMap outputs_; + AttributeMap attrs_; + + private: + void GenerateTemporaryNames(); + void CheckAllInputOutputSet() const; +}; + +// Macro for define a clone method. +// If you are writing an kernel operator, `Clone` will be defined when you +// register it. i.e. `Clone` method is not needed to define by yourself. +#define DEFINE_OP_CLONE_METHOD(cls) \ + std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ + return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ + } + +// Macro for define a default constructor for Operator. +// You can also use +// using PARENT_CLASS::PARENT_CLASS; +// to use parent's constructor. +#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ + cls(const std::string& type, \ + const ::paddle::framework::VariableNameMap& inputs, \ + const ::paddle::framework::VariableNameMap& outputs, \ + const paddle::framework::AttributeMap& attrs) \ + : parent_cls(type, inputs, outputs, attrs) {} + +class NOP : public OperatorBase { + public: + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::Place& place) const override {} + std::unique_ptr Clone() const override { + return std::unique_ptr(new NOP(*this)); + } +}; + +class ExecutionContext { + public: + ExecutionContext(const OperatorBase& op, const Scope& scope, + const platform::DeviceContext& device_context) + : op_(op), scope_(scope), device_context_(device_context) {} + + const OperatorBase& op() const { return op_; } + + const Scope& scope() const { return scope_; } + + template + inline const T& Attr(const std::string& name) const { + return op_.Attr(name); + } + + size_t InputSize(const std::string& name) const { + return op_.Inputs(name).size(); + } + + size_t OutputSize(const std::string& name) const { + return op_.Outputs(name).size(); + } + + const Variable* InputVar(const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); + } + + Variable* OutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? 
nullptr : scope_.FindVar(opt); + } + + const std::vector MultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? nullptr + : scope_.FindVar(name); + }); + return res; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { + return name == kEmptyVarName ? nullptr + : scope_.FindVar(name); + }); + return res; + } + + template + const T* Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + return var == nullptr ? nullptr : var->GetMutable(); + }); + return res; + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, InputSize(in)); + PADDLE_ENFORCE_LT(j, OutputSize(out)); + auto* in_var = MultiInputVar(in)[i]; + auto* out_var = MultiOutputVar(out)[j]; + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + } + + platform::Place GetPlace() const { return device_context_.GetPlace(); } + + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + + const platform::DeviceContext& device_context() const { + return device_context_; + } + +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + + //! Get actual name vector for this input. + const std::vector& Inputs(const std::string& name) const { + return op_.Inputs(name); + } + + //! Get actual name vector for this output. 
+ const std::vector& Outputs(const std::string& name) const { + return op_.Outputs(name); + } + + private: + const OperatorBase& op_; + const Scope& scope_; + const platform::DeviceContext& device_context_; +}; + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const; + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const; + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const; + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const; + +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. + */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; + +class OperatorWithKernel : public OperatorBase { + public: + using OpKernelMap = + std::unordered_map, + OpKernelType::Hash>; + + OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const Scope& scope, const platform::Place& place) const final; + + static std::unordered_map& + AllOpKernels() { + static std::unordered_map g_all_op_kernels; + return g_all_op_kernels; + } + + bool SupportGPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_gpu_place(kern_pair.first.place_); + }); + } + + virtual void InferShape(InferShapeContext* ctx) const { + OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); + } + + protected: + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; + + private: + // indicate kernel DataType by input data. Defaultly all input data must be + // same. + proto::DataType IndicateDataType(const ExecutionContext& ctx) const; +}; + +extern bool OpSupportGPU(const std::string& op_type); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b90f5538bb620275521cdc11bf47b4014b2a66e2 --- /dev/null +++ b/paddle/fluid/framework/operator_test.cc @@ -0,0 +1,273 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { + public: + OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), x(1) {} + void Run(const Scope& scope, const platform::Place& place) const override { + ++op_run_num; + ASSERT_EQ(static_cast(inputs_.size()), 1); + ASSERT_EQ(static_cast(outputs_.size()), 1); + ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr); + } + + public: + int x{0}; +}; + +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op"); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("test_operator"); + BuildVar("input", {"IN1"}, op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + scope.Var("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::op_run_num, 1); +} + +namespace paddle { +namespace framework { + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +static int cpu_kernel_run_num = 0; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); + } +}; + +template +class CPUKernelTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } 
+}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("xs", "inputs of test op").AsDuplicable(); + AddInput("k", "input of test op"); + AddOutput("ys", "outputs of test op").AsDuplicable(); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + auto xs = ctx.op().Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + + auto k = ctx.op().Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op().Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); + +// test with single input +TEST(OpKernel, all) { + paddle::framework::InitDevices(); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("op_with_kernel"); + BuildVar("x", {"IN1"}, op_desc.add_inputs()); + BuildVar("y", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); +} + +REGISTER_OP_WITHOUT_GRADIENT( + op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); + +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + paddle::framework::InitDevices(); + proto::OpDesc op_desc; + + op_desc.set_type("op_multi_inputs_with_kernel"); + BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); + BuildVar("k", {"k0"}, op_desc.add_inputs()); + BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + scope.Var("x0")->GetMutable(); + scope.Var("x1")->GetMutable(); + scope.Var("x2")->GetMutable(); + scope.Var("k0")->GetMutable(); + scope.Var("y0")->GetMutable(); + scope.Var("y1")->GetMutable(); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_place); 
+} + +class OperatorClone : public paddle::framework::OperatorBase { + public: + DEFINE_OP_CLONE_METHOD(OperatorClone); + OperatorClone(const std::string& type, + const paddle::framework::VariableNameMap& inputs, + const paddle::framework::VariableNameMap& outputs, + const paddle::framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const paddle::framework::Scope& scope, + const paddle::platform::Place& place) const override {} +}; + +TEST(Operator, Clone) { + paddle::framework::InitDevices(); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); + auto b = a.Clone(); + ASSERT_EQ(a.Type(), b->Type()); +} diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3f2e97cd954bd55ab1a8c9def6938c877a79449 --- /dev/null +++ b/paddle/fluid/framework/program_desc.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { + auto *b = desc_.add_blocks(); + b->set_parent_idx(parent.ID()); + b->set_idx(desc_.blocks_size() - 1); + blocks_.emplace_back(new BlockDesc(this, b)); + return blocks_.back().get(); +} + +proto::ProgramDesc *ProgramDesc::Proto() { + for (auto &block : blocks_) { + block->Flush(); + } + return &desc_; +} + +ProgramDesc::ProgramDesc() { + auto *block = desc_.mutable_blocks()->Add(); + block->set_idx(kRootBlockIndex); + block->set_parent_idx(kNoneBlockIndex); + blocks_.emplace_back(new BlockDesc(this, block)); +} + +ProgramDesc::ProgramDesc(const ProgramDesc &o) { + desc_ = o.desc_; + for (int i = 0; i < desc_.blocks_size(); ++i) { + auto *block = desc_.mutable_blocks(i); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { + desc_ = desc; + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const std::string &binary_str) { + PADDLE_ENFORCE(desc_.ParseFromString(binary_str), + "Fail to parse program_desc from binary 
string."); + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } +} + +const std::vector ProgramDesc::GetFeedTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector feed_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + } + } + return feed_target_names; +} + +const std::vector ProgramDesc::GetFetchTargetNames() { + BlockDesc *global_block = blocks_[0].get(); + std::vector fetch_target_names; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_target_names.push_back(op->Input("X")[0]); + } + } + return fetch_target_names; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..937de6ba9270a275e5d4e020fe5f2e7f5ef63557 --- /dev/null +++ b/paddle/fluid/framework/program_desc.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class BlockDesc; + +class ProgramDesc { + public: + ProgramDesc(); + + explicit ProgramDesc(const proto::ProgramDesc &desc); + + ProgramDesc(const ProgramDesc &o); + + explicit ProgramDesc(const std::string &binary_str); + + BlockDesc *AppendBlock(const BlockDesc &parent); + + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } + + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } + + size_t Size() const { return blocks_.size(); } + + proto::ProgramDesc *Proto(); + + const std::vector GetFeedTargetNames(); + const std::vector GetFetchTargetNames(); + + private: + proto::ProgramDesc desc_; + + std::vector> blocks_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..afd5c9dabfbb0dab2832300dedc378ef617d8e81 --- /dev/null +++ b/paddle/fluid/framework/program_desc_test.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" + +namespace paddle { +namespace framework { +TEST(ProgramDesc, copy_ctor) { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + ProgramDesc program_copy(program); + + auto* global_block_copy = program_copy.MutableBlock(0); + ASSERT_NE(global_block, global_block_copy); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_copy->HasVar(name)); + auto* copy = global_block_copy->Var(name); + ASSERT_NE(copy, var_before); + ASSERT_EQ(copy->Name(), var_before->Name()); + ASSERT_EQ(copy->GetType(), var_before->GetType()); + ASSERT_EQ(copy->GetShape(), var_before->GetShape()); + ASSERT_EQ(copy->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames()); + ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_copy = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_copy->Type()); + ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); + + ASSERT_EQ(op_copy->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } + + // Not check block's protostr are same it because the order of vars could be + // different and it is correct. 
+} + +TEST(ProgramDescBind, serialize_and_deserialize) { + ProgramDesc program_origin; + auto* global_block = program_origin.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + std::string binary_str; + program_origin.Proto()->SerializeToString(&binary_str); + + ProgramDesc program_restored(binary_str); + auto* global_block_restored = program_restored.MutableBlock(0); + ASSERT_NE(global_block, global_block_restored); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_restored->HasVar(name)); + auto* restored = global_block_restored->Var(name); + ASSERT_NE(restored, var_before); + ASSERT_EQ(restored->Name(), var_before->Name()); + ASSERT_EQ(restored->GetType(), var_before->GetType()); + ASSERT_EQ(restored->GetShape(), var_before->GetShape()); + ASSERT_EQ(restored->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), + global_block_restored->LocalVarNames()); + ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_restored = global_block->Op(i); + + ASSERT_EQ(op_origin->Type(), op_restored->Type()); + ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs()); + + ASSERT_EQ(op_restored->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h similarity index 100% rename from paddle/framework/proto_desc.h rename to paddle/fluid/framework/proto_desc.h diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc new file mode 100644 index 0000000000000000000000000000000000000000..79dbd3bcab4124d3aa765c8ede174c9fb3de689b --- /dev/null +++ b/paddle/fluid/framework/prune.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/prune.h" + +#include +#include +#include +#include +#include + +#include + +namespace paddle { +namespace framework { + +const std::string kFeedOpType = "feed"; +const std::string kFetchOpType = "fetch"; +const std::string kDropOutOpType = "dropout"; +const std::string kBatchNormOpType = "batch_norm"; + +bool HasDependentVar(const proto::OpDesc& op_desc, + const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} + +bool IsTarget(const proto::OpDesc& op_desc) { + if (op_desc.has_is_target()) { + return op_desc.is_target(); + } + return false; +} + +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} + +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set& dependent_vars) { + auto& block = input.blocks(block_id); + auto& ops = block.ops(); + + bool expect_feed = true; + for (auto& op_desc : ops) { + PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, + "All FeedOps are at the beginning of the ProgramDesc"); + expect_feed = (op_desc.type() == kFeedOpType); + } + + bool expect_fetch = true; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch, + "All FetchOps must at the end of the ProgramDesc"); + expect_fetch = (op_desc.type() == kFetchOpType); + } + + std::vector should_run; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { + // insert its input to the dependency graph + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars.insert(argu); + } + } + should_run.push_back(true); + } else { + should_run.push_back(false); + } + } + + // since we are traversing the ProgramDesc in reverse order + // we reverse the should_run vector + std::reverse(should_run.begin(), should_run.end()); + + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); + op_field->Clear(); + for (size_t i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : 
op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + sub_block_dependent_vars); + } + } + } + + // remove the VarDescs in BlockDesc that are not referenced in + // the pruned OpDescs + std::unordered_map var_map; + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); + for (const auto& var : *var_field) { + var_map[var.name()] = var; + } + + std::set var_names; + for (const auto& op : *op_field) { + auto& input_field = op.inputs(); + for (auto& input_var : input_field) { + for (auto& arg : input_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + auto& output_field = op.outputs(); + for (auto& output_var : output_field) { + for (auto& arg : output_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } +} + +// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, dependent_vars); +} + +void inference_optimize_impl(const proto::ProgramDesc& input, + proto::ProgramDesc* output, int block_id) { + *output = input; + auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + for (auto& op_desc : *op_field) { + if (op_desc.type() == kDropOutOpType || + op_desc.type() == kBatchNormOpType) { + for (auto& attr : *op_desc.mutable_attrs()) { + if (attr.name() == "is_test") { + attr.set_b(true); + break; + } + } + } + } +} + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output) { + inference_optimize_impl(input, output, 0); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h new file mode 100644 index 0000000000000000000000000000000000000000..601e66b67a77b615e43fe74e72935b1622e59965 --- /dev/null +++ b/paddle/fluid/framework/prune.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); + +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..36b76f0763ec2bab861adf86b60093c5e3c4b9e2 --- /dev/null +++ b/paddle/fluid/framework/prune_test.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/prune.h" + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/net_op.h" + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" + +#include + +namespace f = paddle::framework; +namespace ops = paddle::operators; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(Prune, one_operator) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; + + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); + + pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); +} + +TEST(Prune, forward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + + for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { + f::proto::ProgramDesc pruned; + pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); + } +} + +TEST(Prune, 
multi_input_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); +} + +TEST(Prune, multi_output_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); +} + +TEST(Prune, multi_target) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); +} diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ef0c4821110a259fd20469e736b93f44a80f90a --- /dev/null +++ b/paddle/fluid/framework/reader.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
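+//
+// ShuffleReader::ReadNext refills an internal buffer with up to buffer_size_
+// instances from the underlying reader, shuffles the buffer, and then hands
+// the instances out one at a time; once the source is exhausted the output
+// vector comes back empty.
+// BatchReader::ReadNext reads up to batch_size_ instances and, for every
+// output slot, concatenates them along dimension 0 into one LoDTensor,
+// checking that data types and trailing dimensions match and merging the LoD.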
+ +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace framework { + +DDim ReaderBase::shape(size_t idx) const { + PADDLE_ENFORCE_LT( + idx, shapes_.size(), + "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, + shapes_.size()); + return shapes_[idx]; +} + +void ShuffleReader::ReadNext(std::vector* out) { + if (iteration_pos_ >= buffer_.size()) { + // Reload buffer with new data + buffer_.clear(); + buffer_.reserve(buffer_size_); + for (int i = 0; i < buffer_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be + // optimize. + std::random_shuffle(buffer_.begin(), buffer_.end()); + iteration_pos_ = 0; + } + out->clear(); + if (!buffer_.empty()) { + std::swap(*out, buffer_[iteration_pos_++]); + } + // if buffer_ is empty, the 'out' will return as an empty vector. +} + +void BatchReader::ReadNext(std::vector* out) { + buffer_.clear(); + buffer_.reserve(batch_size_); + for (int i = 0; i < batch_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // Concat instances + out->clear(); + if (buffer_.empty()) { + // if buffer_ is empty, the 'out' will return as an empty vector. + return; + } + int out_num = buffer_[0].size(); + out->reserve(out_num); + for (int j = 0; j < out_num; ++j) { + // Merge shape and check date type + std::type_index batch_type = buffer_[0][j].type(); + DDim batch_shape = buffer_[0][j].dims(); + for (size_t i = 1; i < buffer_.size(); ++i) { + std::type_index ins_type = buffer_[i][j].type(); + DDim ins_shape = buffer_[i][j].dims(); + PADDLE_ENFORCE_EQ(batch_type, ins_type); + PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), + slice_ddim(ins_shape, 1, ins_shape.size())); + PADDLE_ENFORCE_GT(ins_shape[0], 0); + batch_shape[0] += ins_shape[0]; + } + + LoDTensor out_tensor; + out_tensor.Resize(batch_shape); + out_tensor.mutable_data(platform::CPUPlace(), batch_type); + int64_t dst_offset = 0; + + // Merge lod and data + LoD batch_lod; + for (size_t i = 0; i < buffer_.size(); ++i) { + DDim ins_shape = buffer_[i][j].dims(); + LoD ins_lod = buffer_[i][j].lod(); + if (i == 0) { + batch_lod = ins_lod; + } else { + PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); + for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { + auto& lod_level = batch_lod[level_idx]; + for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { + lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); + } + } + } + Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); + Copy(buffer_[i][j], platform::CPUPlace(), &dst); + dst_offset += ins_shape[0]; + } + out_tensor.set_lod(batch_lod); + out->push_back(out_tensor); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5eba5fb733b3e9da2b245b4dda18725c9b0895 --- /dev/null +++ b/paddle/fluid/framework/reader.h @@ -0,0 +1,161 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor_array.h" + +namespace paddle { +namespace framework { + +class ReaderBase { + public: + explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { + PADDLE_ENFORCE(!shapes_.empty()); + } + virtual void ReadNext(std::vector* out) = 0; + virtual bool HasNext() const = 0; + + virtual void ReInit() = 0; + + DDim shape(size_t idx) const; + std::vector shapes() const { return shapes_; } + void set_shapes(const std::vector& shapes) { shapes_ = shapes; } + + virtual ~ReaderBase() {} + + protected: + std::vector shapes_; +}; + +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& shapes) : ReaderBase(shapes) {} +}; + +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) + : ReaderBase(reader->shapes()), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + bool HasNext() const override { return reader_->HasNext(); } + + void ReInit() override { reader_->ReInit(); } + + protected: + ReaderBase* reader_; +}; + +// file readers + +template +class RandomDataGenerator : public FileReader { + public: + RandomDataGenerator(const std::vector& shapes, float min, float max) + : FileReader(shapes), min_(min), max_(max) { + PADDLE_ENFORCE_LE( + min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + unsigned int seed = std::random_device()(); + engine_.seed(seed); + dist_ = std::uniform_real_distribution(min_, max_); + } + + void ReadNext(std::vector* out) override { + out->clear(); + out->reserve(shapes_.size()); + for (const DDim& shape : shapes_) { + PADDLE_ENFORCE_GE( + shape.size(), 2, + "The rank of reader's output data should be 2 at least.(Now it's %d)", + shape.size()); + LoDTensor out_tensor; + out_tensor.Resize(shape); + T* data = out_tensor.mutable_data(platform::CPUPlace()); + int64_t numel = product(shape); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist_(engine_); + } + out->push_back(out_tensor); + } + } + + bool HasNext() const override { return true; } + + void ReInit() override { return; } + + private: + float min_; + float max_; + std::minstd_rand engine_; + std::uniform_real_distribution dist_; +}; + +// decorated readers + +class ShuffleReader : public DecoratedReader { + public: + ShuffleReader(ReaderBase* reader, int buffer_size) + : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { + buffer_.reserve(buffer_size); + } + + void ReadNext(std::vector* out) override; + + private: + int buffer_size_; + std::vector> buffer_; + size_t iteration_pos_; +}; + +class BatchReader : public DecoratedReader { + public: + BatchReader(ReaderBase* reader, int batch_size) + : DecoratedReader(reader), batch_size_(batch_size) { + buffer_.reserve(batch_size_); + } + + void ReadNext(std::vector* out) override; + + private: + int batch_size_; + std::vector> buffer_; +}; + +// The ReaderHolder is used as readers' unified wrapper, +// making it easier to access different type readers in Variables. 
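+// It owns the wrapped reader through a std::unique_ptr and simply forwards
+// ReadNext / HasNext / ReInit and the shape accessors to it, so a Variable
+// only needs to hold a ReaderHolder regardless of the concrete reader type.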
+class ReaderHolder { + public: + void Reset(ReaderBase* reader) { reader_.reset(reader); } + + ReaderBase* Get() const { return reader_.get(); } + + void ReadNext(std::vector* out) { reader_->ReadNext(out); } + bool HasNext() const { return reader_->HasNext(); } + void ReInit() { reader_->ReInit(); } + + DDim shape(size_t idx) const { return reader_->shape(idx); } + std::vector shapes() const { return reader_->shapes(); } + void set_shapes(const std::vector& shapes) { + reader_->set_shapes(shapes); + } + + private: + std::unique_ptr reader_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc new file mode 100644 index 0000000000000000000000000000000000000000..6006ed16bd4a9aece5772bad58dc75c8b0847206 --- /dev/null +++ b/paddle/fluid/framework/scope.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" + +#include // for unique_ptr +#include // for call_once +#include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/string/printf.h" + +DEFINE_bool(benchmark, false, + "Doing memory benchmark. It will make deleting scope synchronized, " + "and add some memory usage logs." + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + +namespace paddle { +namespace framework { + +Scope::~Scope() { + DropKids(); + for (auto& kv : vars_) { + VLOG(3) << "Destroy variable " << kv.first; + delete kv.second; + } +} + +Scope& Scope::NewScope() const { + kids_.push_back(new Scope(this)); + return *kids_.back(); +} + +Variable* Scope::Var(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); + vars_[name] = v; + VLOG(3) << "Create variable " << name; + v->name_ = &(vars_.find(name)->first); + return v; +} + +Variable* Scope::Var(std::string* name) { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + if (name != nullptr) { + *name = var_name; + } + return Var(var_name); +} + +Variable* Scope::FindVar(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second == var) { + return this; + } + } + return (parent_ == nullptr) ? 
nullptr : parent_->FindScope(var); +} +void Scope::DropKids() { + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +std::vector Scope::LocalVarNames() const { + std::vector known_vars; + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } + return known_vars; +} + +void Scope::DeleteScope(Scope* scope) { + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + this->kids_.erase(it); + // When making memory benchmark on Fluid, we have to delete scope sync. + if (FLAGS_benchmark) { + delete scope; + } else { + Async([scope] { delete scope; }); + } +} + +void Scope::Rename(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = vars_.find(origin_name); + PADDLE_ENFORCE(origin_it != vars_.end(), + "Cannot find original variable with name %s", origin_name); + auto new_it = vars_.find(new_name); + PADDLE_ENFORCE(new_it == vars_.end(), + "The variable with name %s is already in the scope", new_name); + vars_[new_name] = origin_it->second; + vars_.erase(origin_it); +} + +std::string Scope::Rename(const std::string& origin_name) const { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + Rename(origin_name, var_name); + return var_name; +} + +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return nullptr; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h new file mode 100644 index 0000000000000000000000000000000000000000..2da9e0716e7c02dc8c5397e37746344dff8e429d --- /dev/null +++ b/paddle/fluid/framework/scope.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class Scope; + +/** + * @brief Scope that manage all variables. + * + * Scope is an association of a name to Variable. All variables belong to + * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. + * One net can run in different scopes and update different variable in the + * scope. + */ +class Scope { + public: + Scope() {} + ~Scope(); + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a variable with given name if it doesn't exist. + Variable* Var(const std::string& name); + + /// Create a variable with a scope-unique name. + Variable* Var(std::string* name = nullptr); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. 
+ Variable* FindVar(const std::string& name) const; + + const Scope& parent() const { return *parent_; } + + /// Find the scope or an ancestor scope that contains the given variable. + const Scope* FindScope(const Variable* var) const; + + void DeleteScope(Scope* scope); + + /// Drop all kids scopes belonged to this scope. + void DropKids(); + + // enumerate all the variables current contains. + std::vector LocalVarNames() const; + + // Rename variable to a new name + void Rename(const std::string& origin_name, + const std::string& new_name) const; + + // Rename variable to a new name and return the new name + std::string Rename(const std::string& origin_name) const; + + Variable* FindVarLocally(const std::string& name) const; + + private: + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + mutable std::unordered_map vars_; + mutable std::list kids_; + Scope const* parent_{nullptr}; + + DISABLE_COPY_AND_ASSIGN(Scope); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d64acb130cb29eda34cb01ef0533c42f1f03dcf8 --- /dev/null +++ b/paddle/fluid/framework/scope_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +using paddle::framework::Scope; +using paddle::framework::Variable; + +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); + + Variable* v0 = s.Var("a"); + Variable* v1 = ss1.Var("a"); + + EXPECT_NE(v0, v1); + + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} + +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); + + ss.Var("a"); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} + +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.Var("a"); + + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); +} + +TEST(Scope, GetAllNames) { + Scope s; + Variable* v = s.Var("a"); + EXPECT_EQ(&s, s.FindScope(v)); + + std::vector ans = s.LocalVarNames(); + std::string str; + for (auto& var : ans) { + str += var; + } + + EXPECT_STREQ("a", str.c_str()); +} diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5d9e9a4951877e031ea6fdf529676fcb21e202f --- /dev/null +++ b/paddle/fluid/framework/selected_rows.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3st field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4st field, Tensor data + SerializeToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { + { + // the 1st field, unit32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3st field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4st field, tensor which contains the data + DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h new file mode 100644 index 0000000000000000000000000000000000000000..f1a263962b2efc1ca828dd2eeb45495334ac1047 --- /dev/null +++ b/paddle/fluid/framework/selected_rows.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class SelectedRows { + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + } + + platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const Vector& rows() const { return rows_; } + + Vector* mutable_rows() { return &rows_; } + + void set_rows(const Vector& rows) { rows_ = rows; } + + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simplely concated when adding together. Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. + Vector rows_; + std::unique_ptr value_{nullptr}; + int64_t height_; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d414f2a5934282b4d586e6a9f7f81e44afbc9305 --- /dev/null +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/selected_rows.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { + +class SelectedRowsTester : public ::testing::Test { + public: + virtual void SetUp() override { + std::vector rows{0, 4, 7}; + int64_t height = 10; + int64_t row_numel = 100; + selected_rows_.reset(new SelectedRows(rows, height)); + + Tensor* value = selected_rows_->mutable_value(); + value->mutable_data( + make_ddim({static_cast(rows.size()), row_numel}), place_); + } + + protected: + platform::CPUPlace place_; + std::unique_ptr selected_rows_{nullptr}; +}; + +TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } + +TEST_F(SelectedRowsTester, dims) { + ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100})); +} + +TEST_F(SelectedRowsTester, complete_dims) { + ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); +} + +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfd2334f1af19023c607d364172a4176be10f622 --- /dev/null +++ b/paddle/fluid/framework/shape_inference.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/shape_inference.h" +#include "grad_op_desc_maker.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +DDim InferShapeContext::GetInputDim(const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); +} + +std::vector InferShapeContext::GetInputsDim( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); +} + +std::vector InferShapeContext::GetReaderDims( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader input '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->GetRepeatedDims(arg_names[0]); +} + +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + +void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); +} + +void InferShapeContext::SetOutputsDim(const std::string &name, + const std::vector &dims) { + auto &names = Outputs(name); + SetDims(names, dims); +} + +void InferShapeContext::SetReaderDims(const std::string &name, + const std::vector &dims) { + const std::vector &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader output '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->SetRepeatedDims(arg_names[0], dims); +} + +std::vector InferShapeContext::GetInputVarPtrs( + const std::string &name) { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetOutputVarPtrs( + const std::string &name) { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform( + arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { return this->GetVarPtr(name); }); + return res; +} + +std::vector InferShapeContext::GetDims( + const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; +} + +void InferShapeContext::SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } + SetDim(names[i], dims[i]); + } +} + +std::vector InferShapeContext::GetInputsVarType( + const std::string &name) const { + return GetVarTypes(Inputs(name)); +} + +std::vector InferShapeContext::GetOutputsVarType( + const std::string &name) const { + return GetVarTypes(Outputs(name)); +} + +std::vector InferShapeContext::GetVarTypes( + const std::vector &names) const { + std::vector retv; + 
retv.resize(names.size()); + std::transform(names.begin(), names.end(), retv.begin(), + std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, + std::placeholders::_1)); + return retv; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..c907523325c8472f902517deebec9bc02168713c --- /dev/null +++ b/paddle/fluid/framework/shape_inference.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +using InferShapeVarPtr = boost::variant; + +class InferShapeContext { + public: + virtual ~InferShapeContext() = default; + virtual bool HasInput(const std::string &name) const = 0; + virtual bool HasOutput(const std::string &name) const = 0; + + std::vector GetInputsVarType( + const std::string &name) const; + std::vector GetOutputsVarType( + const std::string &name) const; + + virtual bool HasInputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name) const = 0; + + DDim GetInputDim(const std::string &name) const; + std::vector GetInputsDim(const std::string &name) const; + std::vector GetReaderDims(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; + + void SetOutputDim(const std::string &name, const DDim &dim); + void SetOutputsDim(const std::string &name, const std::vector &dims); + void SetReaderDims(const std::string &name, const std::vector &dims); + + virtual AttrReader Attrs() const = 0; + virtual const std::vector &Inputs( + const std::string &name) const = 0; + virtual const std::vector &Outputs( + const std::string &name) const = 0; + + virtual void ShareLoD(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + + virtual bool IsRuntime() const = 0; + + std::vector GetInputVarPtrs(const std::string &name); + std::vector GetOutputVarPtrs(const std::string &name); + + // Note: In while op, we need this to be public + void SetDims(const std::vector &names, + const std::vector &dims); + + protected: + virtual DDim GetDim(const std::string &name) const = 0; + virtual void SetDim(const std::string &name, const DDim &dim) = 0; + virtual std::vector GetRepeatedDims(const std::string &name) const = 0; + virtual void SetRepeatedDims(const std::string &name, + const std::vector &dims) = 0; + + std::vector GetDims(const std::vector &names) const; + + std::vector GetVarTypes( + const std::vector &names) const; + + virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0; + + virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; +}; + +} // 
namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..a56091d3c629c4cedc13c465c84a646dc02cd094 --- /dev/null +++ b/paddle/fluid/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..44d2c7dae943a06eeab8ab1a1565f62b11de0af1 --- /dev/null +++ b/paddle/fluid/framework/tensor.h @@ -0,0 +1,227 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class LoDTensor; + +class Tensor { + public: + template + friend struct EigenTensor; + + template + friend struct EigenMatrix; + + template + friend struct EigenVector; + + public: + Tensor() : offset_(0) {} + + /*! Constructor with place should only be used in pybind. */ + explicit Tensor(const platform::Place& place) : offset_(0) { + holder_->set_place(place); + } + + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /*! Return a pointer to constant memory block. */ + template + inline const T* data() const; + + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + inline void* mutable_data(platform::Place place, std::type_index type); + + inline void* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! 
Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Return the numel of the memory block. */ + inline int64_t numel() const; + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! The internal of two tensors share the same memory block. */ + inline Tensor& ShareDataWith(const Tensor& src); + + /** + * @brief Return a sub-tensor of the given tensor. + * + * @param[in] begin_idx The index of the start row(inclusive) to slice. + * The index number begins from 0. + * @param[in] end_idx The index of the end row(exclusive) to slice. + * The index number begins from 0. + */ + inline Tensor Slice(int begin_idx, int end_idx) const; + + platform::Place place() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::place() is called."); + return holder_->place(); + } + + std::type_index type() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::type() is called."); + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const; + + inline void check_memory_size() const; + + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + + private: + friend class LoDTensor; + + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ + struct Placeholder { + virtual ~Placeholder() = default; + virtual void* ptr() const = 0; + virtual size_t size() const = 0; + virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; + virtual void set_type(std::type_index type) = 0; + virtual void set_place(platform::Place place) = 0; + }; + + template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(Place place, size_t size, std::type_index type) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), + place_(place), + size_(size), + type_(type) { + PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", + (is_cpu_place(place_) ? "CPU" : "GPU")); + } + + virtual size_t size() const { return size_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual std::type_index type() const { return type_; } + virtual void set_type(std::type_index type) { type_ = type; } + virtual void set_place(platform::Place place) { place_ = place; } + + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; + + /* the current type of memory */ + std::type_index type_; + }; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /** + * @brief the layout of memory block, default is NHWC. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + + DataLayout layout_ = DataLayout::kNHWC; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. 
So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ + size_t offset_; +}; + +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. + PADDLE_THROW("Not Implemented"); +} + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" diff --git a/paddle/framework/tensor.md b/paddle/fluid/framework/tensor.md similarity index 100% rename from paddle/framework/tensor.md rename to paddle/fluid/framework/tensor.md diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e69836292cd0f4ed99e87ee8e297021dac43b64f --- /dev/null +++ b/paddle/fluid/framework/tensor_impl.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor functor; + size_t size = functor(type); + PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +inline size_t Tensor::memory_size() const { + return holder_ == nullptr ? 
0UL : holder_->size() - offset_; +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + +template +inline T* Tensor::data() { + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(place, typeid(T))); +} + +inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_ENFORCE_GT( + numel(), 0, + "When calling this method, the Tensor's numel must be larger than zero. " + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + } +#else + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +inline void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing"); + return mutable_data(place, holder_->type()); +} + +inline Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +inline Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +inline const DDim& Tensor::dims() const { return dims_; } + +inline int64_t Tensor::numel() const { return product(dims_); } + +inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + Tensor res; + res.ShareDataWith(src); + res.Resize(flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + +} // namespace framework +} // namespace paddle 
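[Reviewer note, not part of the patch] A minimal usage sketch of the Tensor API introduced in tensor.h / tensor_impl.h above, assuming a CPU-only build; the dimensions, values, and the helper name TensorUsageSketch are illustrative assumptions only and do not belong to this change:

#include "paddle/fluid/framework/tensor.h"

namespace fw = paddle::framework;
namespace platform = paddle::platform;

void TensorUsageSketch() {
  fw::Tensor t;
  t.Resize({4, 3});                                          // sets dims_ only; no allocation yet
  float* buf = t.mutable_data<float>(platform::CPUPlace());  // allocates 4 * 3 * sizeof(float)
  for (int64_t i = 0; i < t.numel(); ++i) {
    buf[i] = static_cast<float>(i);
  }

  // Slice shares the underlying holder_; only dims_ and offset_ differ.
  fw::Tensor rows = t.Slice(1, 3);                           // rows [1, 3), shape {2, 3}

  // ShareDataWith makes an alias of the whole memory block.
  fw::Tensor alias;
  alias.ShareDataWith(t);
  const float* same = alias.data<float>();                   // same pointer as buf (offset_ == 0)
  (void)same;
  (void)rows;
}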
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ed416e46f99f4d2ed50538a3e2c090ed8dd6fc3 --- /dev/null +++ b/paddle/fluid/framework/tensor_test.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor.h" +#include +#include + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +TEST(Tensor, Dims) { + framework::Tensor tt; + tt.Resize({2, 3, 4}); + framework::DDim dims = tt.dims(); + ASSERT_EQ(arity(dims), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(i + 2, dims[i]); + } +} + +TEST(Tensor, DataAssert) { + framework::Tensor src_tensor; + + bool caught = false; + try { + src_tensor.data(); + } catch (platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(Tensor, MutableData) { + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CUDAPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + } +#endif +} + +TEST(Tensor, ShareDataWith) { + { + framework::Tensor src_tensor; + framework::Tensor 
dst_tensor; + // Try to share data form uninitialized tensor + bool caught = false; + try { + dst_tensor.ShareDataWith(src_tensor); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +} + +TEST(Tensor, Slice) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 3); + EXPECT_EQ(slice_dims[0], 2); + EXPECT_EQ(slice_dims[1], 3); + EXPECT_EQ(slice_dims[2], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif +} + +TEST(Tensor, ReshapeToMatrix) { + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { + src_ptr[i] = i; + } + framework::Tensor res = framework::ReshapeToMatrix(src, 2); + ASSERT_EQ(res.dims()[0], 2 * 3); + ASSERT_EQ(res.dims()[1], 4 * 9); +} + +TEST(Tensor, Layout) { + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc new file mode 100644 
index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. 
+ return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu new file mode 100644 index 0000000000000000000000000000000000000000..537fb4614cac8bc898b277899f803a3b1846a00e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + Copy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. 
+ return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h new file mode 100644 index 0000000000000000000000000000000000000000..b7e772b6daad93dc915665d58d4a5722c74c0d2b --- /dev/null +++ b/paddle/fluid/framework/tensor_util.h @@ -0,0 +1,333 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. 
+ */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + memory::Copy( + dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief Wrapper on + * Copy(const Tensor& src, const platform::Place& dst_place, + * const platform::DeviceContext& ctx, Tensor* dst); + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. + */ +inline void Copy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(src.place())) { + dev_ctx = pool.Get(src.place()); + } else { + dev_ctx = pool.Get(dst_place); + } + Copy(src, dst_place, *dev_ctx, dst); +} + +/** + * @brief Copy the content of an external vector to a tensor. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector will resize dst to an 1D tensor with the same + * size as src. 
+ */ +template +inline void CopyFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, src_place, + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyFromVector CPU vector -> CPU Tensor + */ +template +inline void CopyFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + +/** + * @brief Copy the content of a tensor to a vector + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains device resources. + * + * * @note CopyFromVector assumes that the tensor has been resized + * before invoking. + */ +template +inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + boost::get(src.place()), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +/** + * @brief CopyToVector CPUTensor <-> CPU Vector + */ +template +inline void CopyToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + +// Returns true if a tensor contains NAN, i.e., Not A Number. +bool HasNAN(const framework::Tensor& tensor); + +// Returns true if a tensor contains Inf, i.e., Infinity. 
+bool HasInf(const framework::Tensor& tensor); + +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void operator()() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), cpu_tensor.memory_size()); + auto dst_place = dev_ctx.GetPlace(); + framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + 
is.read(static_cast(buf), tensor->memory_size()); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8764c692e875328fc98a7b67a018014af487f394 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(Copy, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); + + auto cpu_place = new platform::CPUPlace(); + Copy(src_tensor, *cpu_place, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + + Tensor slice_tensor = src_tensor.Slice(1, 2); + Copy(slice_tensor, *cpu_place, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new platform::CUDAPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + } +#endif +} + +TEST(CopyFromVector, Tensor) { + using namespace 
paddle::framework; + using namespace paddle::platform; + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CopyFromVector(src_vec, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + ASSERT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor cpu_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + CPUDeviceContext cpu_ctx(*cpu_place); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(make_ddim({3, 3})); + auto gpu_place = new paddle::platform::CUDAPlace(); + CUDADeviceContext gpu_ctx(*gpu_place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(make_ddim({2, 2})); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, cpu_ptr); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(CopyToVector, Tensor) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src; + int* src_ptr = src.mutable_data({3, 3}, CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + CPUPlace place; + std::vector dst; + CopyToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + Tensor gpu_tensor; + CUDAPlace place; + CUDADeviceContext gpu_ctx(place); + CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + CopyToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +TEST(HasNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(HasInf, CPU) { + using namespace 
paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, gpu_ctx); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete gpu_place; + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..1982b642bcd1f2e21a684b701d7bd603f0c2c894 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +static __global__ void FillNAN(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = NAN; +} +static __global__ void FillInf(float* buf) { + buf[0] = 0.0; + buf[1] = INFINITY; + buf[2] = 0.5; +} + +TEST(HasNAN, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasNAN(tensor)); +} + +TEST(HasInf, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasInf(tensor)); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c4de41b0c41fa3eeaf6c77def9c728dd9976895 --- /dev/null +++ b/paddle/fluid/framework/threadpool.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/framework/threadpool.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +std::unique_ptr ThreadPool::threadpool_(nullptr); +std::once_flag ThreadPool::init_flag_; + +ThreadPool* ThreadPool::GetInstance() { + std::call_once(init_flag_, &ThreadPool::Init); + return threadpool_.get(); +} + +void ThreadPool::Init() { + if (threadpool_.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool_.reset(new ThreadPool(num_threads)); + } +} + +ThreadPool::ThreadPool(int num_threads) + : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } +} + +ThreadPool::~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } +} + +void ThreadPool::Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); +} + +void ThreadPool::TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = std::move(tasks_.front()); + tasks_.pop(); + + --idle_threads_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++idle_threads_; + if (Done()) { + completed_.notify_all(); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..e88e6c01f02deb77278a02ba81ee62ddfcf42eb8 --- /dev/null +++ b/paddle/fluid/framework/threadpool.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. +class ThreadPool { + public: + using Task = std::packaged_task()>; + + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); + + ~ThreadPool(); + + // Returns the number of threads created by the constructor. + size_t Threads() const { return total_threads_; } + + // Returns the number of currently idle threads. 
+ size_t IdleThreads() { + std::unique_lock lock(mutex_); + return idle_threads_; + } + + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). + template + std::future Run(Callback fn) { + auto f = this->RunAndGetException(fn); + return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); + } + + template + std::future> RunAndGetException( + Callback fn) { + std::unique_lock lock(mutex_); + Task task([fn]() -> std::unique_ptr { + try { + fn(); + return nullptr; + } catch (platform::EnforceNotMet ex) { + return std::unique_ptr( + new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) + << "Unexpected exception is catched in thread pool. All " + "throwable exception in Fluid should be an EnforceNotMet."; + } + }); + std::future> f = task.get_future(); + tasks_.push(std::move(task)); + lock.unlock(); + scheduled_.notify_one(); + return f; + } + + // Wait until all the tasks are completed. + void Wait(); + + private: + struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } + }; + + DISABLE_COPY_AND_ASSIGN(ThreadPool); + + explicit ThreadPool(int num_threads); + + // If the task queue is empty and avaialbe is equal to the number of + // threads, means that all tasks are completed. Note: this function + // is not thread-safe. Returns true if all tasks are completed. + // Note: don't delete the data member total_threads_ and use + // threads_.size() instead; because you'd need to lock the mutex + // before accessing threads_. + bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } + + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. + static void Init(); + + private: + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; + + std::vector> threads_; + const size_t total_threads_; + size_t idle_threads_; + + std::queue tasks_; + std::mutex mutex_; + bool running_; + std::condition_variable scheduled_; + std::condition_variable completed_; +}; + +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. +template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc similarity index 100% rename from paddle/framework/threadpool_test.cc rename to paddle/fluid/framework/threadpool_test.cc diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..786d78a6440de60abea44f7b8fccb90d455b488c --- /dev/null +++ b/paddle/fluid/framework/type_defs.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +class OperatorBase; +class OpDesc; +class InferShapeContext; +class BlockDesc; + +using VariableNameMap = std::map>; + +// The order should be as same as framework.proto +using Attribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t>; + +using AttributeMap = std::unordered_map; + +using OpCreator = std::function; + +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, + std::unordered_map* /*grad_to_var*/, + const std::vector& grad_block)>; + +using InferVarTypeFN = + std::function; + +using InferShapeFN = std::function; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ec9b2ced94c4176b64996827dcb79f1d756be6b --- /dev/null +++ b/paddle/fluid/framework/var_desc.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } + +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_.type()) { + case proto::VarDesc::READER: { + auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + PADDLE_THROW( + "Setting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_.type()) { + case proto::VarDesc::READER: + return desc_.reader().lod_tensor_size(); + break; + default: + PADDLE_THROW( + "Getting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(proto::DataType data_type) { + mutable_tensor_desc()->set_data_type(data_type); +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +proto::DataType VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + desc_.mutable_lod_tensor()->set_lod_level(lod_level); + break; + case proto::VarDesc::LOD_TENSOR_ARRAY: + desc_.mutable_tensor_array()->set_lod_level(lod_level); + break; + default: + PADDLE_THROW( + "Setting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_.type()) { + case proto::VarDesc::READER: { + size_t i = 0; + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + PADDLE_THROW( + "Setting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_.type()) { + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().lod_level(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().lod_level(); + default: + PADDLE_THROW( + "Getting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_.type()) { + case proto::VarDesc::READER: + res.reserve(desc_.reader().lod_tensor_size()); + for (auto &lod_tensor : desc_.reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + PADDLE_THROW( + "Getting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +const proto::TensorDesc &VarDesc::tensor_desc() const { + PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.lod_tensor().tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.tensor_array().tensor(); + default: + PADDLE_THROW( + "Getting 'tensor_desc' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::tensor_descs() const { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (const auto &lod_tensor : desc_.reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +proto::TensorDesc *VarDesc::mutable_tensor_desc() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + switch (desc_.type()) { + case proto::VarDesc::SELECTED_ROWS: + return desc_.mutable_selected_rows(); + case proto::VarDesc::LOD_TENSOR: + return desc_.mutable_lod_tensor()->mutable_tensor(); + case proto::VarDesc::LOD_TENSOR_ARRAY: + return desc_.mutable_tensor_array()->mutable_tensor(); + default: + PADDLE_THROW( + "Getting 'mutable_tensor_desc' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +std::vector VarDesc::mutable_tensor_descs() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type()) { + case proto::VarDesc::READER: + for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..cdb1bc3ec09c890f2166190d591b6e6ee8b668a0 --- /dev/null +++ b/paddle/fluid/framework/var_desc.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy(repeated_field.begin(), repeated_field.end(), + std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc { + public: + explicit VarDesc(const std::string &name) { + desc_.set_name(name); + desc_.set_type(proto::VarDesc::LOD_TENSOR); + } + + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} + + proto::VarDesc *Proto() { return &desc_; } + + std::string Name() const { return desc_.name(); } + + void SetName(std::string name) { desc_.set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(proto::DataType data_type); + + void SetDataTypes(const std::vector &multiple_data_type); + + proto::DataType GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + proto::VarDesc::VarType GetType() const; + + void SetType(proto::VarDesc::VarType type); + + bool Persistable() const { return desc_.persistable(); } + + void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } + + private: + const proto::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + proto::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + proto::VarDesc desc_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc4de529814bb0f7a5193b8e216343a4b1b3503 --- /dev/null +++ b/paddle/fluid/framework/var_type.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
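// Editor's sketch (not part of this diff): a minimal example of driving the
// VarDesc wrapper declared in var_desc.h above. The variable name "x" and the
// 32x784 shape are made-up illustrative values, and the int64_t/int32_t
// element types are assumptions, since the template arguments of the setters
// have been dropped in this rendering of the header.
#include <vector>
#include "paddle/fluid/framework/var_desc.h"

void VarDescUsageSketch() {
  using paddle::framework::VarDesc;

  VarDesc var("x");                             // type defaults to LOD_TENSOR
  var.SetShape(std::vector<int64_t>{32, 784});  // e.g. batch_size x features
  var.SetLoDLevel(0);                           // dense tensor, no LoD nesting
  var.SetPersistable(false);

  // Accessors mirror the setters and read from the proto::TensorDesc that
  // corresponds to the variable's current type.
  std::vector<int64_t> dims = var.GetShape();
  (void)dims;
}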
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +inline proto::VarDesc::VarType ToVarType(std::type_index type) { + if (type.hash_code() == typeid(LoDTensor).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR; + } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { + return proto::VarDesc_VarType_LOD_RANK_TABLE; + } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { + return proto::VarDesc_VarType_LOD_TENSOR_ARRAY; + } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { + return proto::VarDesc_VarType_SELECTED_ROWS; + } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + return proto::VarDesc_VarType_READER; + } else { + PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); + } +} + +template +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { + switch (ToVarType(var.Type())) { + case proto::VarDesc_VarType_LOD_TENSOR: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_LOD_RANK_TABLE: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_LOD_TENSOR_ARRAY: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_SELECTED_ROWS: + visitor(var.Get()); + return; + case proto::VarDesc_VarType_READER: + visitor(var.Get()); + return; + default: + PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h new file mode 100644 index 0000000000000000000000000000000000000000..44fd4cd622cbada7f10ade8928a69d4e3d1d9ec0 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
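// Editor's sketch (not part of this diff): VisitVarType, defined in
// var_type.h above, dispatches on the runtime payload of a Variable and calls
// the visitor with the concrete object, so a caller supplies one callable with
// an overload per type it cares about. The numel() accessor and the catch-all
// overload below are illustrative assumptions.
#include <iostream>
#include "paddle/fluid/framework/var_type.h"

struct ShapeLogger {
  void operator()(const paddle::framework::LoDTensor& t) const {
    std::cout << "LoDTensor with " << t.numel() << " elements\n";
  }
  // Fallback for the other supported payloads (SelectedRows, LoDRankTable,
  // LoDTensorArray, ReaderHolder); a real visitor would handle each one.
  template <typename T>
  void operator()(const T&) const {
    std::cout << "non-LoDTensor variable\n";
  }
};

void LogVariable(const paddle::framework::Variable& var) {
  paddle::framework::VisitVarType(var, ShapeLogger{});
}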
*/ + +#pragma once +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +class VarTypeInference { + public: + virtual ~VarTypeInference() {} + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ee589c821a77af7f6714fefd7bebff89218dad8 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_type_inference.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { + auto &inputs = op_desc.Input("X"); + auto default_var_type = proto::VarDesc::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string &name) { + return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(default_var_type); + } +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(InferVarType, sum_op) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarDesc::LOD_TENSOR, + 
prog.MutableBlock(0)->Var("test_out")->GetType()); +} + +TEST(InferVarType, sum_op_without_infer_var_type) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum_without_infer_var_type"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR, + prog.MutableBlock(0)->Var("test2_out")->GetType()); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h new file mode 100644 index 0000000000000000000000000000000000000000..9fb8ca92d68203a0bf8ec6ecd30072374b5fe4af --- /dev/null +++ b/paddle/fluid/framework/variable.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +class Variable { + public: + template + const T& Get() const { + PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!IsType()) { + holder_.reset(new PlaceholderImpl(new T())); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ != nullptr && + std::type_index(typeid(T)) == std::type_index(holder_->Type()); + } + + void Clear() { holder_.reset(); } + + std::type_index Type() const { + PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); + return holder_->Type(); + } + + private: + struct Placeholder { + virtual ~Placeholder() {} + virtual const std::type_info& Type() const = 0; + virtual void* Ptr() const = 0; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + + virtual const std::type_info& Type() const { return type_; } + virtual void* Ptr() const { return static_cast(ptr_.get()); } + + std::unique_ptr ptr_; + const std::type_info& type_; + }; + + std::unique_ptr + holder_; // pointers to a PlaceholderImpl object indeed. + + // name_ is only meaningful with a Scope and accessible by it. + // + // NOTE: Please don't expose name_ by adding methods like + // Variable::Name or Scope::VarName! 
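// Editor's note (descriptive comment only): GetMutable<T>() in the Variable
// class above silently discards the current payload whenever the requested
// type differs from the held one: IsType<T>() fails, so a fresh
// PlaceholderImpl<T> replaces holder_. The variable_test.cc added further
// below exercises exactly this, storing a Tensor-like struct and then
// replacing it with a std::string through a second GetMutable call.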
A variable could have a human + // readable name or an auto-generated scope-unique name. In the + // former case, the caller knows the name and doesn't need to access + // the name; in the latter case, the variable should be identified + // by its address but not the unreadable name. + friend class Scope; + const std::string* name_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/variable.md b/paddle/fluid/framework/variable.md similarity index 100% rename from paddle/framework/variable.md rename to paddle/fluid/framework/variable.md diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c14e506fd7fd480012135b316479e45bed5584e --- /dev/null +++ b/paddle/fluid/framework/variable_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/variable.h" + +TEST(Variable, GetMutable) { + using paddle::framework::Variable; + + struct Tensor { + int content_; + }; + + std::unique_ptr v(new Variable()); + + Tensor* t = v->GetMutable(); + t->content_ = 1234; + + const Tensor& tt = v->Get(); + EXPECT_EQ(1234, tt.content_); + + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); +} diff --git a/paddle/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt similarity index 100% rename from paddle/inference/CMakeLists.txt rename to paddle/fluid/inference/CMakeLists.txt diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..58d7ab40bfa67595a9c7c61ed431a7cf9509e1f7 --- /dev/null +++ b/paddle/fluid/inference/io.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
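// Editor's sketch (not part of this diff): the intended call sequence for the
// inference loader implemented below in io.cc, driven directly instead of
// through the TestInference() helper used by the unit tests. The model
// directory, the CPU place, and the single feed/fetch pair are illustrative
// assumptions; the map value types mirror the feed/fetch maps used in
// test_helper.h further below.
#include <map>
#include <string>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"

void RunSavedModel(const std::string& dirname) {
  paddle::platform::CPUPlace place;
  paddle::framework::Executor executor(place);
  paddle::framework::Scope scope;

  // Load() reads dirname + "/__model__" and appends a load op per parameter.
  auto program = paddle::inference::Load(executor, scope, dirname);

  // The feed/fetch variable names are recorded inside the serialized program.
  const auto& feed_names = program->GetFeedTargetNames();
  const auto& fetch_names = program->GetFetchTargetNames();

  paddle::framework::LoDTensor input, output;
  // ... fill `input` with real data before running ...

  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
  feed_targets[feed_names[0]] = &input;
  fetch_targets[fetch_names[0]] = &output;

  executor.Run(*program, &scope, feed_targets, fetch_targets);
}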
*/ + +#include "paddle/fluid/inference/io.h" + +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace inference { + +void ReadBinaryFile(const std::string& filename, std::string& contents) { + VLOG(3) << "loading model from " << filename; + std::ifstream inputfs(filename, std::ios::in | std::ios::binary); + inputfs.seekg(0, std::ios::end); + contents.clear(); + contents.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + inputfs.read(&contents[0], contents.size()); + inputfs.close(); +} + +bool IsParameter(const framework::VarDesc* var, + const framework::ProgramDesc& main_program) { + if (var->Persistable()) { + // There are many unreachable variables in the program + for (size_t i = 0; i < main_program.Size(); ++i) { + const framework::BlockDesc& block = main_program.Block(i); + for (auto* op : block.AllOps()) { + if (op->Type() == framework::kFeedOpType) { + continue; + } + for (auto input_argument_name : op->InputArgumentNames()) { + if (input_argument_name == var->Name()) { + return true; + } + } + } + } + } + return false; +} + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename) { + const framework::BlockDesc& global_block = main_program.Block(0); + + framework::ProgramDesc* load_program = new framework::ProgramDesc(); + framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + + for (auto* var : global_block.AllVars()) { + if (IsParameter(var, main_program)) { + VLOG(3) << "parameter's name: " << var->Name(); + + framework::VarDesc* new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->CheckAttrs(); + } + + executor.Run(*load_program, &scope, 0, true, true); + + VLOG(3) << "Ran loading successfully"; + delete load_program; +} + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname) { + std::string model_filename = dirname + "/__model__"; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, dirname, ""); + return main_program; +} + +std::unique_ptr Load( + framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename) { + std::string model_filename = prog_filename; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new 
framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, "", param_filename); + return main_program; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7864060646d9a480ce6ced6a1f4364e83938c0 --- /dev/null +++ b/paddle/fluid/inference/io.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace inference { + +void LoadPersistables(framework::Executor& executor, + framework::Scope& scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& dirname); + +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename); + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fe76afb582a13b741ab086f0c62d77e86d4e8bb --- /dev/null +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -0,0 +1,34 @@ +function(inference_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(arg_list "") + if(inference_test_ARGS) + foreach(arg ${inference_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} + SRCS test_inference_${TARGET_NAME}.cc + DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(inference_test) + +inference_test(fit_a_line) +inference_test(image_classification ARGS vgg resnet) +inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp) +inference_test(recommender_system) +inference_test(rnn_encoder_decoder) +inference_test(understand_sentiment) +inference_test(word2vec) diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa18e69b3ac7e984172dd14a3cb8d48158dfb471 --- 
/dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor( + input, {batch_size, 13}, static_cast(0), static_cast(10)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc new file mode 100644 index 0000000000000000000000000000000000000000..27f17712bca4103e2556cb375339ca785f53bd4f --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, image_classification) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [0.0, 1.0]. + SetupTensor(input, + {batch_size, 3, 32, 32}, + static_cast(0), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc new file mode 100644 index 0000000000000000000000000000000000000000..55acd95f50906b13a5a906e0bcc2e73a0c7f8ef2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, label_semantic_roles) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, + ctx_p2, mark; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor(word, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + predicate, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_n1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_0, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p1, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(ctx_p2, lod, static_cast(0), static_cast(1)); + SetupLoDTensor(mark, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word); + cpu_feeds.push_back(&predicate); + cpu_feeds.push_back(&ctx_n2); + cpu_feeds.push_back(&ctx_n1); + cpu_feeds.push_back(&ctx_0); + cpu_feeds.push_back(&ctx_p1); + cpu_feeds.push_back(&ctx_p2); + cpu_feeds.push_back(&mark); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..99cf0f3095bf7f93d53272e0ae13242484d7128c --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recognize_digits) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. 
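// Editor's note (descriptive comment only): the tensor-setup helpers used by
// these tests are function templates, and the template arguments have been
// dropped in this rendering of the diff. The call that follows reads, in the
// original source, approximately
//   SetupTensor<float>(input, {batch_size, 1, 28, 28},
//                      static_cast<float>(-1), static_cast<float>(1));
// i.e. it fills the input with uniform random floats in [-1, 1], matching the
// normalized-pixel comment above.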
+ SetupTensor(input, + {batch_size, 1, 28, 28}, + static_cast(-1), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} + +TEST(inference, recognize_digits_combine) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. + SetupTensor( + input, {1, 28, 28}, static_cast(-1), static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference( + dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference( + dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..9208c2a59965ad5296238a23e89cd290b5e19740 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..c88ca30cb781c1980d960c5e4e1137dcfd54afac --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor( + word_data, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + trg_word, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b29d52880cef1710696074ed8b2fdecf4f9fcca --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 0000000000000000000000000000000000000000..93376b6824daf000dd9996c17ca9737b5b600e10 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2072; // Hard-coding the size of dictionary + + SetupLoDTensor(first_word, lod, static_cast(0), dict_size); + SetupLoDTensor(second_word, lod, static_cast(0), dict_size); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..a6c93aa0737f79ca1d626862256d3c79a36868ae --- /dev/null +++ b/paddle/fluid/inference/tests/test_helper.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
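// Editor's note (descriptive comment only): the helpers declared below in
// test_helper.h are templates whose parameters have also been dropped in this
// rendering. In the original source TestInference is declared roughly as
//   template <typename Place, bool IsCombined = false>
//   void TestInference(const std::string& dirname, ...);
// so the tests above instantiate it as TestInference<paddle::platform::CPUPlace>
// for the CPU runs, TestInference<paddle::platform::CUDAPlace> under
// PADDLE_WITH_CUDA, and TestInference<..., true> for the "combined" model
// files. CheckError compares outputs element-wise with a tolerance of 1e-3 for
// float and 1e-6 for double, as set in its body below.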
*/ + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/io.h" + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + T lower, + T upper) { + srand(time(0)); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = + (static_cast(rand()) / static_cast(RAND_MAX)) * (upper - lower) + + lower; + } +} + +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input.mutable_data(dims, paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::LoD& lod, + T lower, + T upper) { + input.set_lod(lod); + int dim = lod[0][lod[0].size() - 1]; + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + paddle::framework::LoD lod, + std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input.set_lod(lod); + SetupTensor(input, dims, data); +} + +template +void CheckError(paddle::framework::LoDTensor& output1, + paddle::framework::LoDTensor& output2) { + // Check lod information + EXPECT_EQ(output1.lod(), output2.lod()); + + EXPECT_EQ(output1.dims(), output2.dims()); + EXPECT_EQ(output1.numel(), output2.numel()); + + T err = static_cast(0); + if (typeid(T) == typeid(float)) { + err = 1E-3; + } else if (typeid(T) == typeid(double)) { + err = 1E-6; + } else { + err = 0; + } + + size_t count = 0; + for (int64_t i = 0; i < output1.numel(); ++i) { + if (fabs(output1.data()[i] - output2.data()[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; +} + +template +void TestInference(const std::string& dirname, + const std::vector& cpu_feeds, + std::vector& cpu_fetchs) { + // 1. Define place, executor, scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + if (IsCombined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // Users are free to specify different filename + // (provided: the filenames are changed in the python api as well: io.py) + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + // 4. 
Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. Run the inference program + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + delete scope; +} diff --git a/paddle/platform/.clang-format b/paddle/fluid/memory/.clang-format similarity index 100% rename from paddle/platform/.clang-format rename to paddle/fluid/memory/.clang-format diff --git a/paddle/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt similarity index 100% rename from paddle/memory/CMakeLists.txt rename to paddle/fluid/memory/CMakeLists.txt diff --git a/paddle/memory/README.md b/paddle/fluid/memory/README.md similarity index 100% rename from paddle/memory/README.md rename to paddle/fluid/memory/README.md diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt similarity index 100% rename from paddle/memory/detail/CMakeLists.txt rename to paddle/fluid/memory/detail/CMakeLists.txt diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cee8271d27014815b19175ef93759d6a07b7e73 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -0,0 +1,329 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "glog/logging.h" + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + VLOG(10) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + VLOG(10) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + VLOG(10) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + VLOG(10) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, 
MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifdef PADDLE_WITH_CUDA + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + VLOG(10) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + VLOG(10) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if 
((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + VLOG(10) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..644d79330680787f717920652708c0dd5bee1833 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include +#include +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +class BuddyAllocator { + public: + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + + ~BuddyAllocator(); + + public: + void* Alloc(size_t unaligned_size); + void Free(void* ptr); + size_t Used(); + + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + + private: + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; + + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); + + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); + + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); + + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); + + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! 
\brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); + + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc new file mode 100644 index 0000000000000000000000000000000000000000..23388cdd5b7c44ff91e10aadaa8cc25d8ef29d14 --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "paddle/fluid/memory/detail/meta_data.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + 
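// A worked example of this split, assuming a hypothetical sizeof(Metadata)
// of 64 bytes: splitting a 4096-byte chunk with size = 1024 leaves this
// block owning bytes [0, 1024) with 1024 - 64 = 960 usable bytes, while the
// new right partition owns bytes [1024, 4096) with remaining_size = 3072 and
// 3072 - 64 = 3008 usable bytes. The statements below then shrink this
// block's own metadata to the new size and rewire the buddy links so that
// this block, the right partition, and the former right buddy stay chained.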
metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h similarity index 100% rename from paddle/memory/detail/memory_block.h rename to paddle/fluid/memory/detail/memory_block.h diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d78811c7715b906aea1b88c13a4c3939db6387d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/meta_cache.h" +#include "glog/logging.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + auto* meta = reinterpret_cast(block); + VLOG(10) << "Load MetaData type=" << meta->type; + PADDLE_ASSERT(meta->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..635d6398e697de80d0606a200c2634a93468199d --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ +class MetadataCache { + public: + explicit MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock* memory_block); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock* memory_block, const Metadata& meta_data); + + /*! \brief Indicate that the specified metadata will no longer be used. 
*/ + void invalidate(MemoryBlock* memory_block); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/meta_data.cc new file mode 100644 index 0000000000000000000000000000000000000000..eae49ebdcffd03eeb192fb7e859666027336245b --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h new file mode 100644 index 0000000000000000000000000000000000000000..368523701ef1a7b3bd869e1f0542c42c61448b40 --- /dev/null +++ b/paddle/fluid/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f07c5e789c42997b3c75167a26a5b09875bd498 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +#include // for malloc and free +#include // for mlock and munlock +#include // for std::max + +#include "gflags/gflags.h" + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// should set false to use_pinned_memory. +DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); +DECLARE_double(fraction_of_gpu_memory_to_use); +namespace paddle { +namespace memory { +namespace detail { + +void* CPUAllocator::Alloc(size_t& index, size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. 
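// Note: the "index" out-parameter records how the block must be released
// later: 0 means the memory is unpinned and only needs free(), while 1 means
// the pages were pinned with mlock() and must be munlock()'d first, which is
// what CPUAllocator::Free below checks. A minimal usage sketch (hypothetical
// caller, not part of this allocator):
//   size_t idx = 0;
//   void* buf = cpu_allocator.Alloc(idx, 1 << 20);  // 1 MiB; idx becomes 0 or 1
//   cpu_allocator.Free(buf, 1 << 20, idx);          // pass the same idx back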
+ if (size <= 0) return nullptr; + + index = 0; // unlock memory + + void* p; + +#ifdef PADDLE_WITH_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0); +#endif + PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } + } + + return p; +} + +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { + munlock(p, size); + } + free(p); +} + +bool CPUAllocator::UseGpu() const { return false; } + +#ifdef PADDLE_WITH_CUDA + +void* GPUAllocator::Alloc(size_t& index, size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + void* p; + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) + << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " + "environment variable to a lower value. Current value is " + << FLAGS_fraction_of_gpu_memory_to_use; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h similarity index 100% rename from paddle/memory/detail/system_allocator.h rename to paddle/fluid/memory/detail/system_allocator.h diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a850e480ec948b727980a8020df91958584aea02 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/detail/system_allocator.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_bool(use_pinned_memory); + +void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { + bool freed = false; + { + size_t index; + void* p = a.Alloc(index, size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } + + int* i = static_cast(p); + std::shared_ptr ptr(i, [&](void* p) { + freed = true; + a.Free(p, size, index); + }); + } + EXPECT_TRUE(freed); +} + +TEST(CPUAllocator, NoLockMem) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +TEST(CPUAllocator, LockMem) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(GPUAllocator, Alloc) { + paddle::memory::detail::GPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc new file mode 100644 index 0000000000000000000000000000000000000000..8938b3613373a06620a6a0237b3de773c6421edd --- /dev/null +++ b/paddle/fluid/memory/memcpy.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" + +#include // for memcpy + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +#ifdef PADDLE_WITH_CUDA +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..77d209c3fbe8256bc94b3eca866f0f7e17a93325 --- /dev/null +++ b/paddle/fluid/memory/memcpy.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifdef PADDLE_WITH_CUDA + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); + +#endif +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6eedab5d034192c071328b1be5c296227383287e --- /dev/null +++ b/paddle/fluid/memory/memory.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/memory/memory.h" + +#include "glog/logging.h" + +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); + +namespace paddle { +namespace memory { + +using BuddyAllocator = detail::BuddyAllocator; + +BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); + } + return a; +} + +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void* p = GetCPUBuddyAllocator()->Alloc(size); + VLOG(10) << " pointer=" << p; + return p; +} + +template <> +void Free(platform::CPUPlace place, void* p) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifdef PADDLE_WITH_CUDA + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetCUDADeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = nullptr; + } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; + } + return as[gpu_id]; +} + +template <> +size_t Used(platform::CUDAPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); +} + +template <> +void* Alloc(platform::CUDAPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; +} + +template <> +void Free(platform::CUDAPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); +} + +#endif + +size_t Usage::operator()(const platform::CPUPlace& cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t memory_usage(const platform::Place& p) { + return boost::apply_visitor(Usage(), p); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..a9166a6746e1985752ca18ffa7c429e5b35b55bb --- /dev/null +++ 
b/paddle/fluid/memory/memory.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ +template +void* Alloc(Place place, size_t size); + +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ +template +void Free(Place place, void* ptr); + +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ +template +size_t Used(Place place); + +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; +}; + +size_t memory_usage(const platform::Place& p); + +/** + * \brief Free memory block in one place. + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + + public: + explicit PODDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + Place place_; +}; + +/** + * \brief Free memory block in one place does not meet POD + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PlainDeleter { + public: + explicit PlainDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } + + private: + Place place_; +}; + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7505ef0f36bc8765ba7634f286b67bccc6eacb6 --- /dev/null +++ b/paddle/fluid/memory/memory_test.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/meta_data.h" + +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +#include +#include + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = cpu; + EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifdef PADDLE_WITH_CUDA + +size_t align(size_t size, paddle::platform::CUDAPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CUDAPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::platform::Place place = gpu; + EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::CUDAPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif diff --git a/paddle/fluid/operators/.clang-format b/paddle/fluid/operators/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cadfd735d7b3feb473c308b04417f0a1e0f22249 --- /dev/null +++ b/paddle/fluid/operators/CMakeLists.txt @@ -0,0 +1,203 @@ +file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") +set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) +file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. 
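# Typical invocations, as used further down in this file:
#   op_library(softmax_op DEPS softmax)
#   op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc
#              DEPS vol2col depthwise_conv)
# When SRCS is omitted, the function looks for ${TARGET}.cc, ${TARGET}.cu.cc
# and ${TARGET}.cu in the current source directory, and it appends the
# matching USE_OP / USE_CPU_ONLY_OP / USE_NO_KERNEL_OP line to the generated
# pybind header unless the target is one of the manual_pybind_op exceptions.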
+ set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) + set(cc_srcs) + set(cu_srcs) + set(cu_cc_srcs) + set(op_common_deps operator op_registry math_function) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(pybind_flag 0) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + else() + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() + + # Define operators that don't need pybind here. + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() + + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
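# The block below infers which operator name to expose: it reads the target's
# .cc file and scans for REGISTER_OP(...) calls. If the file registers more
# than one operator, the first registered name is kept; otherwise the name is
# derived from the target by dropping the trailing "_op" (e.g. "softmax_op"
# becomes "softmax"). Targets without a REGISTER_OP_CPU_KERNEL line are then
# declared to pybind with USE_NO_KERNEL_OP.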
+ file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") + endif() + + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") + string(REPLACE "_op" "" TARGET "${TARGET}") + if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_CPU_ONLY_OP + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_OP + if (${pybind_flag} EQUAL 0) + file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") + endif() +endfunction() + +add_subdirectory(math) +add_subdirectory(nccl) + +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() + +if(WITH_DISTRIBUTE) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) +endif() + +op_library(cond_op DEPS framework_proto tensor net_op) +op_library(cross_entropy_op DEPS cross_entropy) +op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) +op_library(softmax_op DEPS softmax) +op_library(detection_output_op DEPS softmax) +op_library(sequence_softmax_op DEPS softmax) +op_library(sum_op DEPS selected_rows_functor) +op_library(sgd_op DEPS selected_rows_functor) +op_library(print_op DEPS lod_tensor) +op_library(adagrad_op DEPS selected_rows_functor) +op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) +op_library(pool_with_index_op DEPS pooling) +op_library(lod_rank_table_op DEPS lod_rank_table) +op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) +op_library(sequence_conv_op DEPS context_project) +op_library(sequence_pool_op DEPS sequence_pooling) +op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(lstmp_op DEPS sequence2batch lstm_compute) +op_library(gru_op DEPS sequence2batch gru_compute) +op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale 
math_function) +op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) +op_library(create_reader_op DEPS reader) + +# Regist multiple Kernel to pybind +if (WITH_GPU) + +op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS + vol2col depthwise_conv) + +op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) +op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc + conv_transpose_cudnn_op.cu.cc DEPS vol2col) +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") +file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") +else() +op_library(conv_op SRCS conv_op.cc DEPS vol2col) +op_library(pool_op SRCS pool_op.cc DEPS pooling) +op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) +endif() + +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) +op_library(save_combine_op DEPS lod_tensor) +op_library(load_combine_op DEPS lod_tensor) + +list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) +foreach(src ${GENERAL_OPS}) + op_library(${src}) +endforeach() +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") + +set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + +cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) +cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) +cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) +if(WITH_GPU) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() +cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43689b3b7da5a0f5157ec8bc5fcf19d643ddc4ca --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/accuracy_op.h" + +namespace paddle { +namespace operators { + +class AccuracyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input (Out) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input (Indices) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input (Label) of accuracy op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), + "Output (Accuracy) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Correct"), + "Output (Correct) of AccuracyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Total"), + "Output (Total) of AccuracyOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Out"); + auto label_dim = ctx->GetInputDim("Label"); + // Assume indices has same shape as inference, because + // it's the output of topk. + + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1"); + PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0], + "the inference tensor's num_rows must be" + " the same as label."); + + ctx->SetOutputDim("Accuracy", {1}); + ctx->SetOutputDim("Correct", {1}); + ctx->SetOutputDim("Total", {1}); + ctx->ShareLoD("Out", /*->*/ "Accuracy"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.GetPlace()); + } +}; + +class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // TODO(typhoonzero): support both inference value and indices. + AddInput("Out", "The network output of topk (inferences)"); + AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Label", "Label of the training data"); + // TODO(typhoonzero): AddInput("Weight", ... + AddOutput("Accuracy", "The accuracy of current batch"); + AddOutput("Correct", "The correct samples count of current batch"); + AddOutput("Total", "The samples count of current batch"); + + AddComment(R"DOC( +Accuracy Operator. + +It will print accuracy rate for classification. +The accuracy is calculated as follows: + +$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$ + +Both the input Out and Label can carry the LoD (Level of Details) +information, or not. But the output only shares the LoD information +with the input Out(Inference). + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, + paddle::framework::EmptyGradOpMaker); +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. +REGISTER_OP_CPU_KERNEL(accuracy, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4462b9ba5c0e902933c53130f72fe40f807bde4a --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.cu @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/operators/accuracy_op.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, const int D, + const int64_t* Xdata, + const int64_t* labeldata, int* correct_data, + float* accuracy, int* total_data) { + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + + // reduce the count with init value 0, and output accuracy. + int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(result) / static_cast(N); + *total_data = N; + } +} + +template +class AccuracyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + int num_samples = static_cast(inference->dims()[0]); + size_t infer_width = inference->dims()[1]; + auto stream = ctx.cuda_device_context().stream(); + platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_samples, infer_width, indices_data, label_data, correct_data, + accuracy_data, total_data); + } +}; + +} // namespace operators +} // namespace paddle + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/accuracy_op.h b/paddle/fluid/operators/accuracy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b3ed1d3fe09ba044142ed69a463918d7e03a78e9 --- /dev/null +++ b/paddle/fluid/operators/accuracy_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AccuracyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + *accuracy_data = 0.0f; + + if (num_samples == 0) { + return; + } + + int num_correct = 0; + // assume inference is already the topk of the output + for (size_t i = 0; i < num_samples; ++i) { + PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0"); + for (size_t j = 0; j < class_dim; ++j) { + if (indices_data[i * class_dim + j] == label_data[i]) { + ++num_correct; + break; + } + } + } + + *correct_data = num_correct; + *total_data = num_samples; + *accuracy_data = + static_cast(num_correct) / static_cast(num_samples); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c04dd8cb9163cf7b05fd09bcf7f1d2937368614f --- /dev/null +++ b/paddle/fluid/operators/activation_op.cc @@ -0,0 +1,615 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/activation_op.h" + +namespace paddle { +namespace operators { + +class ActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sigmoid operator"); + AddOutput("Out", "Output of Sigmoid operator"); + AddComment(R"DOC( +Sigmoid Activation Operator + +$$out = \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LogSigmoid operator"); + AddOutput("Out", "Output of LogSigmoid operator"); + AddComment(R"DOC( +Logsigmoid Activation Operator + +$$out = \log \frac{1}{1 + e^{-x}}$$ + +)DOC"); + } +}; + +class ExpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Exp operator"); + AddOutput("Out", "Output of Exp operator"); + AddComment(R"DOC( +Exp Activation Operator. + +$out = e^x$ + +)DOC"); + } +}; + +class ReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu operator"); + AddOutput("Out", "Output of Relu operator"); + AddComment(R"DOC( +Relu Activation Operator. + +$out = \max(x, 0)$ + +)DOC"); + } +}; + +class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of LeakyRelu operator"); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); + AddComment(R"DOC( +LeakyRelu Activation Operator. + +$out = \max(x, \alpha * x)$ + +)DOC"); + } +}; + +class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softshrink operator"); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); + AddComment(R"DOC( +Softshrink Activation Operator. 
+ +$$ +out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class TanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Tanh operator"); + AddOutput("Out", "Output of Tanh operator"); + AddComment(R"DOC( +Tanh Activation Operator. + +$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of TanhShrink operator"); + AddOutput("Out", "Output of TanhShrink operator"); + AddComment(R"DOC( +TanhShrink Activation Operator. + +$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"); + } +}; + +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardShrink operator"); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(0.5f); + AddComment(R"DOC( +HardShrink Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sqrt operator"); + AddOutput("Out", "Output of Sqrt operator"); + AddComment(R"DOC( +Sqrt Activation Operator. + +$out = \sqrt{x}$ + +)DOC"); + } +}; + +class AbsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Abs operator"); + AddOutput("Out", "Output of Abs operator"); + AddComment(R"DOC( +Abs Activation Operator. + +$out = |x|$ + +)DOC"); + } +}; + +class CeilOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Ceil operator"); + AddOutput("Out", "Output of Ceil operator"); + AddComment(R"DOC( +Ceil Activation Operator. + +$out = ceil(x)$ + +)DOC"); + } +}; + +class FloorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Floor operator"); + AddOutput("Out", "Output of Floor operator"); + AddComment(R"DOC( +Floor Activation Operator. + +$out = floor(x)$ + +)DOC"); + } +}; + +class RoundOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Round operator"); + AddOutput("Out", "Output of Round operator"); + AddComment(R"DOC( +Round Activation Operator. 
+ +$out = [x]$ + +)DOC"); + } +}; + +class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Reciprocal operator"); + AddOutput("Out", "Output of Reciprocal operator"); + AddComment(R"DOC( +Reciprocal Activation Operator. + +$$out = \frac{1}{x}$$ + +)DOC"); + } +}; + +class LogOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Log operator"); + AddOutput("Out", "Output of Log operator"); + AddComment(R"DOC( +Log Activation Operator. + +$out = \ln(x)$ + +Natural logarithm of x. + +)DOC"); + } +}; + +class SquareOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Square operator"); + AddOutput("Out", "Output of Square operator"); + AddComment(R"DOC( +Square Activation Operator. + +$out = x^2$ + +)DOC"); + } +}; + +class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softplus operator"); + AddOutput("Out", "Output of Softplus operator"); + AddComment(R"DOC( +Softplus Activation Operator. + +$out = \ln(1 + e^{x})$ + +)DOC"); + } +}; + +class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softsign operator"); + AddOutput("Out", "Output of Softsign operator"); + AddComment(R"DOC( +Softsign Activation Operator. + +$$out = \frac{x}{1 + |x|}$$ + +)DOC"); + } +}; + +class BReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of BRelu operator"); + AddOutput("Out", "Output of BRelu operator"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); + AddComment(R"DOC( +BRelu Activation Operator. + +$out = \max(\min(x, t_{min}), t_{max})$ + +)DOC"); + } +}; + +class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of SoftRelu operator"); + AddOutput("Out", "Output of SoftRelu operator"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(40.0f); + AddComment(R"DOC( +SoftRelu Activation Operator. + +$out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ + +)DOC"); + } +}; + +class ELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ELU operator"); + AddOutput("Out", "Output of ELU operator"); + AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); + AddComment(R"DOC( +ELU Activation Operator. 
+ +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1511.07289. + +$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$ + +)DOC"); + } +}; + +class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { + public: + Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Relu6 operator"); + AddOutput("Out", "Output of Relu6 operator"); + AddAttr("threshold", "The threshold value of Relu6") + .SetDefault(6.0f); + AddComment(R"DOC( +Relu6 Activation Operator. + +$out = \min(\max(0, x), 6)$ + +)DOC"); + } +}; + +class PowOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PowOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Pow operator"); + AddOutput("Out", "Output of Pow operator"); + AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); + AddComment(R"DOC( +Pow Activation Operator. + +$out = x^{factor}$ + +)DOC"); + } +}; + +class STanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of STanh operator"); + AddOutput("Out", "Output of STanh operator"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(2.0f / 3.0f); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(1.7159f); + AddComment(R"DOC( +STanh Activation Operator. + +$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ + +)DOC"); + } +}; + +class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of ThresholdedRelu operator"); + AddOutput("Out", "Output of ThresholdedRelu operator"); + AddAttr("threshold", "The threshold location of activation") + .SetDefault(1.0f); + AddComment(R"DOC( +ThresholdedRelu Activation Operator. + +$$ +out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} +$$ + +)DOC"); + } +}; + +class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardSigmoid operator"); + AddOutput("Out", "Output of HardSigmoid operator"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(0.2f); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(0.5f); + AddComment(R"DOC( +HardSigmoid Activation Operator. + +Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), +which is much faster than sigmoid. + +$out = \max(0, \min(1, slope * x + shift))$ + +The slope should be positive. The offset can be either positive or negative. +The default slope and shift are set according to the above reference. +It is recommended to use the defaults for this activation. 
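+For example, with the default slope (0.2) and offset (0.5) the output is 0 for
+x <= -2.5, 1 for x >= 2.5, and 0.2 * x + 0.5 in between.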
+ +)DOC"); + } +}; + +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Out", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. + +$$out = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::ActivationOpGrad); + +REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, + logsigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, + tanh_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, + softshrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationOpGrad); + +REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, + ops::ActivationOpGrad); + +REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, + ops::ActivationOpGrad); + +REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, + ops::ActivationOpGrad); + +REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, + ops::ActivationOpGrad); + +REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, + reciprocal_grad, ops::ActivationOpGrad); + +REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, + ops::ActivationOpGrad); + +REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, + ops::ActivationOpGrad); + +REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, + ops::ActivationOpGrad); + +REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, + leaky_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad, + ops::ActivationOpGrad); + +REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad, + ops::ActivationOpGrad); + +REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, + ops::ActivationOpGrad); + +REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, + ops::ActivationOpGrad); + +REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, + hard_shrink_grad, ops::ActivationOpGrad); + +REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, + thresholded_relu_grad, ops::ActivationOpGrad); + +REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, + hard_sigmoid_grad, ops::ActivationOpGrad); + +REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, + ops::ActivationOpGrad); + +#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ + 
REGISTER_OP_CPU_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b86a7926a978b987d9dbb51fba55e025aab5e7fd --- /dev/null +++ b/paddle/fluid/operators/activation_op.cu @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/activation_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a6ae2224c84cc17223f43046f06f08d11451439 --- /dev/null +++ b/paddle/fluid/operators/activation_op.h @@ -0,0 +1,799 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class ActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + Out.mutable_data(context.GetPlace()); + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out); + } +}; + +template +class ActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto dout = framework::EigenVector::Flatten(*dOut); + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenVector::Flatten(*Out); + auto dx = framework::EigenVector::Flatten(*dX); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out, dout, dx); + } +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) 
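+// (equivalently f'(x) = sigmoid(-x) = 1 - sigmoid(x), the derivative of -log(1 + exp(-x)))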
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } +}; + +// exp(x) = e^x +template +struct ExpFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); + } +}; + +template +struct ExpGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; + } +}; + +// relu(x) = max(x, 0) +template +struct ReluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast(); + } +}; + +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + out.device(d) = x * (temp1 + temp2); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < 
-lambdaT).template cast().eval(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < -lambdaT).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +// sqrt(x) = x^(1/2) +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + const Out out_conj = Eigen::numext::conj(out); + dx.device(d) = static_cast(0.5) * dout / out_conj; + } +}; + +// ceil(x) = ceiling(x) +template +struct CeilFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.ceil(); + } +}; + +template +struct ZeroGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0) / x; + } +}; + +// floor(x) = flooring(x) +template +struct FloorFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.floor(); + } +}; + +// round(x) = [x] +template +struct RoundFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.round(); + } +}; + +// abs(x) = |x| +template +struct AbsFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.abs(); + } +}; + +template +struct AbsGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.sign(); + } +}; + +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; + } +}; + +template +struct ReciprocalGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; + } +}; + +// log(x) = natural logarithm of x +template +struct LogFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log(); + } +}; + +template +struct LogGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) / x); + } +}; + +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +struct SquareGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
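+  // GetAttrs() returns attribute-name -> field-pointer pairs; ActivationKernel
+  // copies the op's t_min / t_max attribute values through these pointers
+  // before the functor is applied.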
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } +}; + +// relu6(x) = min(max(0, x), 6) +template +struct Relu6Functor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); + } +}; + +template +struct Relu6GradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(0)) * (x < static_cast(threshold))) + .template cast(); + } +}; + +// softplus(x) = log(1 + exp(x)) +// When x is a very large positive number, exp(x) may explode to inf, +// Using trick below for numerical stability +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +template +struct SoftplusFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + } +}; + +// d(softplus(x))/dx = exp(x) / (1 + exp(x)) +// For numerical stability: +// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + +// exp(x - max(x, 0))) +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + dx.device(d) = + dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + } +}; + +// softsign(x) = x / (1 + |x|) +template +struct SoftsignFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + out.device(d) = x / (static_cast(1) + x.abs()); + } +}; + +// d(softsign(x))/dx = 1 / (1 + |x|)^2 +// Taken from https://en.wikipedia.org/wiki/Activation_function +template +struct SoftsignGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + dx.device(d) = + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); + } +}; + +template +struct SoftReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto tmp = static_cast(threshold); + auto temp = x.cwiseMax(-tmp).cwiseMin(tmp); + out.device(d) = (static_cast(1) + temp.exp()).log(); + } +}; + +template +struct SoftReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", 
&threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto tmp = static_cast(threshold); + auto temp = ((x > -tmp) * (x < tmp)).template cast().eval(); + dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; + } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(alpha) * + (x < static_cast(0)).template cast().eval(); + auto temp2 = (x >= static_cast(0)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)) + + (static_cast(alpha) * (x.exp() - static_cast(1))) + .cwiseMin(static_cast(0)); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast() + + dout * (out + static_cast(alpha)) * + (x < static_cast(0)).template cast(); + } +}; + +// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 +template +struct PowFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.pow(static_cast(factor)); + } +}; + +template +struct PowGradFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(factor) * + x.pow(static_cast(factor - static_cast(1))); + } +}; + +template +struct STanhFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); + } +}; + +template +struct STanhGradFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto a = static_cast(scale_a); + auto b = static_cast(scale_b); + auto temp = (a * x).tanh() * (a * x).tanh(); + dx.device(d) = dout * a * b * (static_cast(1) - temp); + } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } 
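+  // thresholded_relu(x) = x if x > threshold, 0 otherwise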
+ + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } +}; + +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * out)); + dx.device(d) = dout * ((beta * out) + temp2); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, ExpFunctor, ExpGradFunctor); \ + __macro(relu, ReluFunctor, ReluGradFunctor); \ + __macro(tanh, TanhFunctor, TanhGradFunctor); \ + __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ + __macro(abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log, LogFunctor, LogGradFunctor); \ + __macro(square, SquareFunctor, SquareGradFunctor); \ + __macro(brelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ + __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELUFunctor, ELUGradFunctor); \ + 
__macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ececd47e6a6787a161405fec75dafda336fddfbf --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/adadelta_op.h" + +namespace paddle { +namespace operators { + +class AdadeltaOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"), + "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), + "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredGradOut"), + "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AvgSquaredUpdateOut"), + "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "param and grad input of AdadeltaOp should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), + "Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), + "Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("AvgSquaredGradOut", param_dim); + ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); + } +}; + +class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); + AddInput("AvgSquaredUpdate", + "(Tensor) Input average of squared parameter updates"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("AvgSquaredGradOut", + "(Tensor) Output average of squared gradient"); + AddOutput("AvgSquaredUpdateOut", + "(Tensor) Output average of squared parameter updates"); + + 
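+    // rho below is the exponential decay rate and epsilon the numerical-stability
+    // constant used in the Adadelta update equations in the operator documentation below.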
AddAttr("rho", + "(float, default 0.95) Exponential decay rate " + "for squared gradients.") + .SetDefault(0.95f); + AddAttr("epsilon", + "(float, default 1.0e-6) Constant for " + "numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Adadelta Optimizer. + +Adadelta optimizer is implemented as explained in: +https://arxiv.org/abs/1212.5701 +Adadelta is a per-dimension adaptive learning rate method used +for gradient descent. + +Adadelta updates are as follows: + +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); +REGISTER_OP_CPU_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.cu b/paddle/fluid/operators/adadelta_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..733482f788df8dfe1224ebe0d4494111bf9f647b --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adadelta_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82ced08710448293fb91a4fb0dea7ab216cd3da6 --- /dev/null +++ b/paddle/fluid/operators/adadelta_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdadeltaOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto avg_squared_grad_out_tensor = + ctx.Output("AvgSquaredGradOut"); + auto avg_squared_update_out_tensor = + ctx.Output("AvgSquaredUpdateOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); + avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); + + T rho = static_cast(ctx.Attr("rho")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + // Squared gradient accumulator + auto avg_squared_grad = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredGrad")); + // Squared updates accumulator + auto avg_squared_update = framework::EigenVector::Flatten( + *ctx.Input("AvgSquaredUpdate")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto avg_squared_grad_out = + framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); + auto avg_squared_update_out = + framework::EigenVector::Flatten(*avg_squared_update_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + avg_squared_grad_out.device(place) = + rho * avg_squared_grad + (1 - rho) * grad.square(); + auto update = + -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) + .sqrt() * + grad; + avg_squared_update_out.device(place) = + rho * avg_squared_update + (1 - rho) * update.square(); + param_out.device(place) = param + update; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..61c0ecd019b1d7811ee5cfd4b43358bdb0fba3d9 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/adagrad_op.h" + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +class AdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdagradOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdagradOp should have the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Adaptive Gradient Algorithm (Adagrad). + +The update is done as follows: + +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have the epsilon attribute. It is added here in our implementation +as also proposed here: http://cs231n.github.io/neural-networks-3/#ada +for numerical stability to avoid the division by zero error. + +)DOC"); + } +}; + +namespace { +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + + // 2. 
m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + for (size_t i = 0; i < merge_rows.size(); i++) { + for (int64_t j = 0; j < grad_width; j++) { + param_data[merge_rows[i] * grad_width + j] -= + lr[0] * grad_merge_data[i * grad_width + j] / + (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon); + } + } + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1117363c133fe02c0c6b0a563d0b3665efb7fb18 --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.cu @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adagrad_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, + T* grad_merge, const int64_t* grad_merge_rows, + size_t grad_merge_rows_size, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t grad_merge_idx; + + if (tid == 0) { + for (size_t i = 0; i < grad_merge_rows_size; i++) { + if (grad_rows[ty] == grad_merge_rows[i]) { + grad_merge_idx = i; + } + } + } + + __syncthreads(); + + grad += ty * row_numel; + grad_merge += grad_merge_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + } +} + +template +__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, + const T* learning_rate, T* param, + T* moment, int64_t row_numel, + T epsilon) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + grad += ty * row_numel; + param += rows[ty] * row_numel; + moment += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
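+      // Per-element Adagrad step: param -= lr * grad / (sqrt(moment) + epsilon);
+      // moment already holds the updated sum of squared gradients at this point.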
+ paddle::platform::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); + } +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_width = grad.value().dims()[1]; + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + framework::Vector merge_rows(grad_merge.rows()); + // 2. m += g_m * g_m + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); + + math::SelectedRowsAddToTensor functor; + functor(context, grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid2(1, merge_rows.size()); + SparseAdagradFunctorKernel< + T, 256><<(context) + .stream()>>>( + grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, + param_data, moment_data, grad_width, epsilon); + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ee503b2c36299c7550f6679fe6e4bca7c33c8eee --- /dev/null +++ b/paddle/fluid/operators/adagrad_op.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct SparseAdagradFunctor { + void operator()(const DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param); +}; + +template +class AdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out_tensor = ctx.Output("ParamOut"); + auto* moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto* learning_rate = ctx.Input("LearningRate"); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } + } else if (grad_var->IsType()) { + auto* param_tensor = ctx.Input("Param"); + PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); + + auto* moment_tensor = ctx.Input("Moment"); + PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); + + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), + *ctx.Input("LearningRate"), epsilon, + moment_out_tensor, param_out_tensor); + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25da9336b28ca8d14bf01ce8ca13bd8b379e9b10 --- /dev/null +++ b/paddle/fluid/operators/adam_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
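For reference, the dense branch of AdagradOpKernel above is element-wise equivalent to the following plain loop. This is an illustrative sketch (the function name and the float/scalar-learning-rate assumption are ours), not code from the patch:

    #include <cmath>
    #include <cstddef>

    // Dense Adagrad step: accumulate squared gradients, then scale the update by
    // the inverse root of the accumulator. Mirrors moment_out = moment + g * g and
    // param_out = param - lr * g / (sqrt(moment_out) + epsilon) above.
    void DenseAdagradStep(const float* grad, const float* lr, float epsilon,
                          float* moment, float* param, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i) {
        moment[i] += grad[i] * grad[i];
        param[i] -= lr[0] * grad[i] / (std::sqrt(moment[i]) + epsilon);
      }
    }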
*/ + +#include "paddle/fluid/operators/adam_op.h" + +namespace paddle { +namespace operators { + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); + + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + + AddComment(R"DOC( +Adam Optimizer. 
+ +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. + +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.cu b/paddle/fluid/operators/adam_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..85b806eb6a1c0ed28cd47331786b66fc2e3a21eb --- /dev/null +++ b/paddle/fluid/operators/adam_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adam_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a51b46ef15778cf83d4d4f9c2d8f366b1c5d6b9f --- /dev/null +++ b/paddle/fluid/operators/adam_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
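The update in the comment block above amounts to the following scalar step per parameter element, with the bias correction folded into the learning rate in the same way AdamFunctor does below. An illustrative sketch (float types and names are ours), not code from the patch; beta1_pow and beta2_pow stand for the "power accumulator" inputs, i.e. the running powers of beta1 and beta2:

    #include <cmath>

    // One Adam step for a single element.
    void AdamStep(float g, float lr, float beta1, float beta2, float epsilon,
                  float beta1_pow, float beta2_pow,
                  float* mom1, float* mom2, float* param) {
      lr *= std::sqrt(1 - beta2_pow) / (1 - beta1_pow);  // bias-corrected step size
      *mom1 = beta1 * (*mom1) + (1 - beta1) * g;
      *mom2 = beta2 * (*mom2) + (1 - beta2) * g * g;
      *param -= lr * (*mom1 / (std::sqrt(*mom2) + epsilon));
    }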
*/ + +#pragma once +#include // for sqrt in CPU and CUDA +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel) {} + + inline HOSTDEVICE void operator()(size_t i) const { + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + +template +class AdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = 
static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + // merge duplicated rows if any. + scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + int64_t* rows = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); + } else { + rows = grad_merge.mutable_rows()->data(); + } + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad_merge.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2249b8f96da86438748ab5b2b0f748cc590b8f7 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
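To make the sparse indexing concrete: after MergeAdd, row i of the merged gradient holds row_numel contiguous values, and its j-th value updates the dense parameter, moments and outputs at offset rows[i] * row_numel + j, so only rows actually present in the SelectedRows gradient are touched. This is the indexing SparseAdamFunctor's inner loop above computes; the helper below is illustrative only, not part of the patch:

    #include <cstdint>

    // Dense offset touched by the j-th value of merged sparse row i.
    inline std::int64_t SparseToDenseOffset(const std::int64_t* rows,
                                            std::int64_t row_numel,
                                            std::int64_t i, std::int64_t j) {
      return rows[i] * row_numel + j;
    }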
*/ + +#include "paddle/fluid/operators/adamax_op.h" + +namespace paddle { +namespace operators { + +class AdamaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InfNorm"), + "Input(InfNorm) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamaxOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"), + "Output(InfNormOut) of AdamaxOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("InfNorm"), + "Param and InfNorm input of AdamaxOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + ctx->SetOutputDim("InfNormOut", param_dims); + } +}; + +class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment", "(Tensor) First moment"); + AddInput("InfNorm", + "(Tensor) " + "Input exponentially weighted infinity norm"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output first moment"); + AddOutput("InfNormOut", + "(Tensor) " + "Output exponentially weighted infinity norm"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "1st moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the weighted " + "infinity norm estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddComment(R"DOC( +Adamax Optimizer. + +We implement the Adamax optimizer from Section 7 of the Adam +paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the +Adam algorithm based on the infinity norm. 
+ +Adamax updates: + +$$ +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} +$$ + +The original paper does not have an epsilon attribute. +However, it is added here for numerical stability to prevent the +division by 0 error. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.cu b/paddle/fluid/operators/adamax_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..44a5d6c7bdeac94ceb710d981c6445c046528cb0 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/adamax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..124453c0eceb4caa53bf63a8d9e8c4b90a2213c9 --- /dev/null +++ b/paddle/fluid/operators/adamax_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
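Read element-wise, the Adamax rule above is the following scalar step. An illustrative sketch (float types and names are ours), not code from the patch; beta1_pow is the running power of beta1:

    #include <algorithm>
    #include <cmath>

    // One Adamax step for a single element.
    void AdamaxStep(float g, float lr, float beta1, float beta2, float epsilon,
                    float beta1_pow, float* moment, float* inf_norm, float* param) {
      *moment = beta1 * (*moment) + (1 - beta1) * g;
      *inf_norm = std::max(beta2 * (*inf_norm) + epsilon, std::fabs(g));
      float lr_t = lr / (1 - beta1_pow);  // bias correction for the first moment
      *param -= lr_t * (*moment / *inf_norm);
    }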
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdamaxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + auto inf_norm_out_tensor = ctx.Output("InfNormOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + inf_norm_out_tensor->mutable_data(ctx.GetPlace()); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto inf_norm = framework::EigenVector::Flatten( + *ctx.Input("InfNorm")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + auto beta1_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta1Pow")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto inf_norm_out = + framework::EigenVector::Flatten(*inf_norm_out_tensor); + auto* place = ctx.template device_context().eigen_device(); + + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = + grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); + auto lr_t = lr / (1 - beta1_pow); + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(*place) = + param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..4ffb414ecea350006e5a370a0b25ae304cace89c --- /dev/null +++ b/paddle/fluid/operators/array_operator.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class ArrayOp : public framework::OperatorBase { + public: + ArrayOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + size_t GetOffset(const framework::Scope &scope, + const platform::Place &place) const { + auto *i = scope.FindVar(Input("I")); + PADDLE_ENFORCE(i != nullptr, "I must be set"); + auto &i_tensor = i->Get(); + PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + size_t offset; + if (platform::is_gpu_place(i_tensor.place())) { + // FIXME: Avoid copy from GPU to CPU + framework::Tensor t; + framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t); + dev_ctx.Wait(); + offset = static_cast(*t.data()); + } else { + offset = static_cast(*i_tensor.data()); + } + VLOG(10) << " Offset = " << offset; + return offset; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf8e11bd8c047275fe341ead9424d02e98d5d8f4 --- /dev/null +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +class ArrayToLoDTensorOp : public framework::OperatorBase { + public: + ArrayToLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + + // Check dims, place and data type of input's elements and infer output's + // dim + PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); + int rank = x[0].dims().size(); + platform::Place place = x[0].place(); + std::type_index data_type = x[0].type(); + framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank); + int64_t batch_size = x[0].dims()[0]; + for (size_t i = 1; i < x.size(); ++i) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims, + "The dimension of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place), + "The place class of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(x[i].type() == data_type, + "The date type of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + batch_size += x[i].dims()[0]; + } + auto ins_dim_vec = framework::vectorize(ins_dims); + ins_dim_vec.insert(ins_dim_vec.begin(), batch_size); + framework::DDim out_dims = framework::make_ddim(ins_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto &table_items = rank_table.items(); + std::vector table_item_idx(table_items.size()); + // table_item_idx = range(table_items_idx.size()) + std::iota(table_item_idx.begin(), table_item_idx.end(), 0); + std::sort(table_item_idx.begin(), table_item_idx.end(), + [&](size_t a, size_t b) { + return table_items[a].index < table_items[b].index; + }); + + // Build LoDTensor `out` + framework::LoD *out_lod = out->mutable_lod(); + out_lod->clear(); + size_t out_offset = 0; + auto prefix_lod = rank_table.coarse_lod(); + prefix_lod.emplace_back(); + auto &cur_level_lod = prefix_lod.back(); + cur_level_lod.push_back(0); + for (size_t idx : table_item_idx) { + cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); + for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) { + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x[x_idx].lod(), idx, idx + 1, 0); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; + // Copy data + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = + 
platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); + out_offset += len; + } + } + out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end()); + } +}; + +class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(std::vector) A vector of tensors that is going to " + "be casted to a big LoDTensor."); + AddInput("RankTable", + "(LoDRankTable) RankTable provides the coarse lod infomation to " + "build the output LoDTensor. See " + "'paddle/framework/lod_rank_table.h' for more details."); + AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); + AddComment( + R"DOC(This Op build a big LoDTensor from a std::vector + and a LoDRankTable. It is supposed to be used in getting dynamic RNN's + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build + with RNN's input.)DOC"); + } +}; + +class ArrayToLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "ArrayToLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("RankTable"), + "ArrayToLoDTensorOp must has input RankTable."); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("lod_tensor_to_array"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp, + ops::ArrayToLoDTensorOpProtoMaker, + ops::ArrayToLoDTensorInferShape, + ops::ArrayToLoDTensorGradMaker); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f99f9af4276c0e8928f821ae166d55aed02e8e27 --- /dev/null +++ b/paddle/fluid/operators/assign_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
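For intuition about the LoD that Run assembles above: the coarse level is simply the running sum of the sorted rank-table item lengths. A standalone illustrative sketch (the function name is ours), not code from the patch:

    #include <cstddef>
    #include <vector>

    // Mirrors the cur_level_lod loop above: lengths {3, 2, 2} -> lod {0, 3, 5, 7}.
    std::vector<std::size_t> BuildCoarseLod(const std::vector<std::size_t>& lengths) {
      std::vector<std::size_t> lod{0};
      for (std::size_t len : lengths) {
        lod.push_back(lod.back() + len);
      }
      return lod;
    }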
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class AssignFunctor { + public: + AssignFunctor(framework::Variable *out, + const platform::DeviceContext &dev_ctx) + : out_(out), dev_ctx_(dev_ctx) {} + + void operator()(const framework::LoDTensor &lod_tensor) const { + auto &out_tensor = *out_->GetMutable(); + copy_tensor(lod_tensor, &out_tensor); + } + + void operator()(const framework::LoDTensorArray &array) const { + auto &out_array = *out_->GetMutable(); + out_array.resize(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + copy_tensor(array[i], &out_array[i]); + } + } + + void operator()(const framework::SelectedRows &rows) const { + framework::SelectedRows &out_rows = + *out_->GetMutable(); + out_rows.set_rows(rows.rows()); + out_rows.set_height(rows.height()); + auto &t = rows.value(); + auto *m = out_rows.mutable_value(); + framework::Copy(t, t.place(), dev_ctx_, m); + } + + template + void operator()(const T &v) const { + PADDLE_THROW("Not support type for assign op %s", typeid(T).name()); + } + + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + auto &out_tensor = *out; + Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + framework::Variable *out_; + const platform::DeviceContext &dev_ctx_; +}; + +class AssignOp : public framework::OperatorBase { + public: + AssignOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) { + return; + } + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE( + out != nullptr, + "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); + } +}; + +class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " + "could be LoDTensor, SelectedRows or LoDTensorArray.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor, SelectedRows or LoDTensorArray) The type of output " + "is the same as input X."); + AddComment(R"DOC(Assign Operator + +Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray] +raise error if the type is not listed above. 
+)DOC"); + } +}; + +class AssignInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + if (context->HasInput("X")) { + auto type = context->GetInputsVarType("X")[0]; + if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS || + type == framework::proto::VarDesc_VarType_LOD_TENSOR) { + context->SetOutputDim("Out", context->GetInputDim("X")); + } + } + } +}; + +class AssignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("assign"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, + ops::AssignInferShape, ops::AssignOpProtoMaker); diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..835043d9ab49a20a73d5dd0fff936cb3e9473b1e --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/assign_value_op.h" + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::DataType(ctx.Attr("dtype")), ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr>("shape", + "(vector) " + "Shape of values."); + AddAttr("dtype", "data type of values") + .InEnum({framework::proto::DataType::INT32, + framework::proto::DataType::FP32}); + AddAttr>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..616163f97b9b917187ff66339c01f95289f2f618 --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h new file mode 100644 index 0000000000000000000000000000000000000000..33a344cad596a079faf2582ee1d9dc497531465a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::DataType::INT32: + value_name = "int32_values"; + break; + case framework::proto::DataType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::CopyFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ac08ea4a19b981b0dc8dac43e4ae5de7b09bb5d --- /dev/null +++ b/paddle/fluid/operators/auc_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/auc_op.h" + +namespace paddle { +namespace operators { + +class AucOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input of Indices should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input of Label should not be null."); + auto inference_height = ctx->GetInputDim("Out")[0]; + auto label_height = ctx->GetInputDim("Label")[0]; + + PADDLE_ENFORCE_EQ(inference_height, label_height, + "Out and Label should have same height."); + + ctx->SetOutputDim("AUC", {1}); + ctx->ShareLoD("Out", /*->*/ "AUC"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Out")->type()), + ctx.device_context()); + } +}; + +class AucOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AucOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Out", + "A floating point 2D tensor, values are in the range [0, 1]." + "Each row is sorted in descending order. This input should be the" + "output of topk." + "Typically, this tensor indicates the probability of each label"); + AddInput("Indices", + "An int 2D tensor, indicating the indices of original" + "tensor before sorting. Typically, this tensor indicates which " + "label the probability stands for."); + AddInput("Label", + "A 2D int tensor indicating the label of the training data." + "The height is batch size and width is always 1."); + // TODO(typhoonzero): support weight input + AddOutput("AUC", + "A scalar representing the " + "current area-under-the-curve."); + + AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") + .SetDefault("ROC"); + AddAttr("num_thresholds", + "The number of thresholds to use when discretizing the" + " roc curve.") + .SetDefault(200); + + AddComment(R"DOC( +Area Under The Curve (AUC) Operator. + +This implementation computes the AUC according to forward output and label. +It is used very widely in binary classification evaluation. As a note: +If input label contains values other than 0 and 1, it will be cast +to bool. You can find the relevant definitions here: +https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + +There are two types of possible curves: +1. ROC: Receiver operating characteristic +2. PR: Precision Recall +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e648db70974087f84020f45c568fb0c1924a88dd --- /dev/null +++ b/paddle/fluid/operators/auc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +class AucKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* label = ctx.Input("Label"); + auto* auc = ctx.Output("AUC"); + + float* auc_data = auc->mutable_data(ctx.GetPlace()); + + std::string curve = ctx.Attr("curve"); + int num_thresholds = ctx.Attr("num_thresholds"); + std::vector thresholds_list; + thresholds_list.reserve(num_thresholds); + for (int i = 1; i < num_thresholds - 1; i++) { + thresholds_list[i] = (float)i / (num_thresholds - 1); + } + const float kEpsilon = 1e-7; + thresholds_list[0] = 0.0f - kEpsilon; + thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; + + size_t batch_size = inference->dims()[0]; + size_t inference_width = inference->dims()[1]; + + const T* inference_data = inference->data(); + const int64_t* label_data = label->data(); + + // Create local tensor for storing the curve: TP, FN, TN, FP + // TODO(typhoonzero): use eigen op to caculate these values. + Tensor true_positive, false_positive, true_negative, false_negative; + + true_positive.Resize({num_thresholds}); + false_negative.Resize({num_thresholds}); + true_negative.Resize({num_thresholds}); + false_positive.Resize({num_thresholds}); + + int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); + int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); + int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); + int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); + + for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { + // caculate TP, FN, TN, FP for current thresh + int64_t tp = 0, fn = 0, tn = 0, fp = 0; + for (size_t i = 0; i < batch_size; i++) { + // NOTE: label_data used as bool, labels >0 will be treated as true. + if (label_data[i]) { + // use first(max) data in each row + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + tp++; + } else { + fn++; + } + } else { + if (inference_data[i * inference_width] >= + (thresholds_list[idx_thresh])) { + fp++; + } else { + tn++; + } + } + } + // store rates + tp_data[idx_thresh] = tp; + fn_data[idx_thresh] = fn; + tn_data[idx_thresh] = tn; + fp_data[idx_thresh] = fp; + } + // epsilon to avoid divide by zero. + float epsilon = 1e-6; + // Riemann sum to caculate auc. 
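    // Concretely, for the ROC curve the loops below apply the trapezoidal rule
    // over adjacent threshold points:
    //   AUC ~= sum_i (fpr_i - fpr_{i+1}) * (tpr_i + tpr_{i+1}) / 2
    // and, for the PR curve, the same rule over (recall, precision) points
    // (tp_rate and rec_rate below), accumulated into *auc_data.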
+ Tensor tp_rate, fp_rate, rec_rate; + tp_rate.Resize({num_thresholds}); + fp_rate.Resize({num_thresholds}); + rec_rate.Resize({num_thresholds}); + float* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); + float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); + float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); + for (int i = 0; i < num_thresholds; i++) { + tp_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); + fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = + ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); + } + *auc_data = 0.0f; + if (curve == "ROC") { + for (int i = 0; i < num_thresholds - 1; i++) { + auto dx = fp_rate_data[i] - fp_rate_data[i + 1]; + auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } else if (curve == "PR") { + for (int i = 1; i < num_thresholds; i++) { + auto dx = tp_rate_data[i] - tp_rate_data[i - 1]; + auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; + *auc_data = *auc_data + dx * y; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..506c25d50d453ef841e6885c412ccff38f25cebb --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -0,0 +1,448 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); + PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); + PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); + + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], + ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); + } +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("is_test", "").SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. 
" + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddComment(R"DOC( +Batch Normalization. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Can be used as a normalizer function for conv2d and fully_connected operations. +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + if (!is_test) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + saved_mean->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap saved_variance_e( + saved_variance->mutable_data(ctx.GetPlace()), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. 
- momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (is_test) { + ConstEigenVectorArrayMap var_arr( + ctx.Input("Variance")->data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + ctx.Output("SavedVariance")->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + is_test ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), + C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap bias_arr(bias->data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, + N * C); + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, + N * sample_size) = + (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), ""); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(saved_mean->data(), C); + ConstEigenVectorArrayMap inv_var_arr(saved_inv_variance->data(), C); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. 
/ N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + + EigenVectorArrayMap d_bias_arr(d_bias->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap d_scale_arr(d_scale->mutable_data(ctx.GetPlace()), + C); + + d_bias_arr.setZero(); + d_scale_arr.setZero(); + + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), + sample_size, N * C); + d_x_arr.setZero(); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c)); + } + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, + N * sample_size); + d_x_arr.setZero(); + + const auto d_y_row_sum = d_y_arr.rowwise().sum(); + const auto x_minus_mean = x_arr.colwise() - mean_arr; + const auto d_y_mul_x_minus_mean_row_sum = + (d_y_arr * x_minus_mean).rowwise().sum(); + const auto inv_var_sqr = inv_var_arr * inv_var_arr; + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + batch_norm_grad, ops::BatchNormGradOp); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9c97211e14c0ef3a99a7e2b5cbfd8b267d40c1e --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -0,0 +1,278 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" + +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + VLOG(1) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); + + auto handle = dev_ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. + if (is_test) { + // only when test we use input to do computation. + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + // Run inference mode. + PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_mean->dims()[0], C); + PADDLE_ENFORCE_EQ(est_var->dims()[0], C); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), + bn_param_desc_, scale->template data(), bias->template data(), + est_mean->template data(), est_var->template data(), epsilon)); + } else { + // Run training mode. + // obtain running mean and running inv var, and see if we need to + // initialize them. + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), bias->template data(), this_factor, + mean_out->template mutable_data(ctx.GetPlace()), + variance_out->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean->template mutable_data(ctx.GetPlace()), + saved_variance->template mutable_data(ctx.GetPlace()))); + } + + // clean when exit. 
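+    // (data_desc_ and bn_param_desc_ were created with
+    //  cudnnCreateTensorDescriptor at the top of Compute; destroying them
+    //  here avoids leaking a pair of cuDNN descriptors on every forward call)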
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = saved_mean->template data(); + const void *saved_var_data = saved_var->template data(); + + auto &dev_ctx = ctx.template device_context(); + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data(), + d_scale->template mutable_data(ctx.GetPlace()), + d_bias->template mutable_data(ctx.GetPlace()), epsilon, + saved_mean_data, saved_var_data)); + + // clean when exit. 
+ CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fa9942ad099f4a28a3abc68c676edeeb827aacd0 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class BatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7737d4e098ac9a0e56e1db2aee796550e8d71ba3 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, LoDTensor* score_tensor) + : step_ids_(step_ids), + step_scores_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) {} + + template + void operator()() const; + + const LoDTensorArray& step_ids_; + const LoDTensorArray& step_scores_; + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +template +void BeamSearchDecodeFunctor::operator()() const { + BeamSearchDecoder beam_search_decoder; + beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + score_tensor_); +} + +template <> +void BeamSearchDecodeFunctor::operator()() const { + PADDLE_THROW("beam search decode op does not support bool!"); +} + +class BeamSearchDecodeOp : public framework::OperatorBase { + public: + BeamSearchDecodeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + + framework::ExecutionContext ctx(*this, scope, dev_ctx); + + const LoDTensorArray* ids = ctx.Input("Ids"); + const LoDTensorArray* scores = ctx.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT(step_num, 0UL, + "beam search steps should be larger than 0"); + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0"); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL, + "Level of LodTensor should be 2"); + } + + // prepare output + LoDTensor* sentenceIds = ctx.Output("SentenceIds"); + LoDTensor* sentenceScores = ctx.Output("SentenceScores"); + + framework::VisitDataType( + framework::ToDataType(scores->at(0).type()), + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + } +}; + +class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddInput("Scores", + "(LodTensorArray)" + "score of the candidate words in each step"); + AddOutput("SentenceIds", + "(LodTensor)" + "All possible result sentences of word ids"); + AddOutput("SentenceScores", + "(LodTensor)" + "All possible result sentences of word scores"); + AddComment(R"DOC( +Pack the result of Beam search op into SentenceIds and SentenceScores. 
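+
+Both outputs are LoDTensors with a two-level LoD: the first level groups the
+decoded sentences by source sentence, and the second level records the word
+offsets of every decoded sentence.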
+)DOC"); + } +}; + +class BeamSearchDecodeInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("Ids"), + "BeamSearchDecodeOp must has input Ids"); + PADDLE_ENFORCE(context->HasInput("Scores"), + "BeamSearchDecodeOp must has input Scores"); + PADDLE_ENFORCE(context->HasOutput("SentenceIds"), + "BeamSearchDecodeOp must has output SentenceIds"); + PADDLE_ENFORCE(context->HasOutput("SentenceScores"), + "BeamSearchDecodeOp must has output SentenceScores"); + } +}; + +class BeamSearchDecodeInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + for (auto& o : op_desc.Output("SentenceIds")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto& o : op_desc.Output("SentenceScores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp, + paddle::operators::BeamSearchDecodeOpProtoMaker, + paddle::operators::BeamSearchDecodeInferShape, + paddle::operators::BeamSearchDecodeInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aeecb8d39acf1e2761aec62b89322c9cbbfe7445 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -0,0 +1,280 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// all the lod have 2 levels. +// The First is source level, the second is sentence level. +// source level describe how many candidate words for this source. 
+// sentence level describe these candidates belong to which prefix +const size_t kSourceLevel = 0; +const size_t kSentenceLevel = 1; + +template +struct BeamNode { + BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} + + ~BeamNode() { + if (parent_) { + parent_->DropKid(this); + if (parent_->kids_.size() == 0UL) { + delete parent_; + } + } + VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; + } + + void AppendTo(BeamNode* parent) { + parent_ = parent; + parent->kids_.insert(this); + } + + void DropKid(BeamNode* kid) { kids_.erase(kid); } + + BeamNode* parent_ = nullptr; + std::unordered_set kids_; + int64_t word_id_; + T score_; +}; + +template +using BeamNodeVector = std::vector>>; + +template +struct Sentence { + std::vector word_ids; + std::vector scores; +}; + +template +using SentenceVector = std::vector>; + +template +struct BeamSearchDecoder { + /** + * make a BeamNode and all it's related prefix BeanNode into a Sentence. + */ + Sentence MakeSentence(const BeamNode* node) const; + + /** + * Param: + * cur_ids: LoDTensor of One step for word ID + * cur_scores: LoDTensor of One Step for word score + * prefixes_list: prefixes for each source sentence. + * sentence_vector_list: result sentence_vector for each source sentence. + * Return: + * a new prefixes list for each source of current step + */ + std::vector> PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const; + + /** + * convert the result sentence_vector for each source sentence into two + * LodTensor. + * One is all candidate sentences with word id, one is all candidate sentences + * with word score. + * Param: + * sentence_vector_list: sentence_vector for each source sentence. + * id_tensor: result LoDTensor for sentences of id. + * score_tensor: result LoDTensor for sentences of score. + */ + void ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + /** + * Pack all steps of id/score LodTensor into sentence LoDTensor + * it's main logic is: + * ```python + * prefix + * result_sentence + * result_lod_tensor + * + * for (step in steps): + * prefix = PackTwoSteps(prefix, step, &result_sentence) + * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) + * ``` + */ + void PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; +}; + +template +Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { + Sentence sentence; + while (node != nullptr) { + sentence.word_ids.emplace_back(node->word_id_); + sentence.scores.emplace_back(node->score_); + node = node->parent_; + } + + std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); + std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); + + return sentence; +} + +template +std::vector> BeamSearchDecoder::PackTwoSteps( + const LoDTensor& cur_ids, const LoDTensor& cur_scores, + std::vector>& prefixes_list, + std::vector>* sentence_vector_list) const { + std::vector> result; + + for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; + ++src_idx) { + size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + + BeamNodeVector beam_nodes; + + // if prefixes size is 0, it means this is the first step. 
In this step, + // all candidate id is the start of candidate sentences. + if (prefixes_list.empty()) { + PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), + cur_ids.lod().at(kSentenceLevel).back(), + "in the first step"); + for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { + beam_nodes.push_back(std::unique_ptr>(new BeamNode( + cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); + } + } else { + BeamNodeVector& prefixes = prefixes_list[src_idx]; + SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; + + PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), + "prefix and candidate set number should be the same"); + + auto candidate_offset = cur_ids.lod()[kSentenceLevel]; + for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { + std::unique_ptr>& prefix = prefixes[prefix_idx]; + size_t candidate_start = candidate_offset[src_start + prefix_idx]; + size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; + if (candidate_start == candidate_end) { + VLOG(3) << "this sentence has no more candidate, " + "add to result sentence and rm it from beam tree"; + sentence_vector.push_back(MakeSentence(prefix.get())); + prefix.reset(); + } else { + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + auto* candidate = + new BeamNode(cur_ids.data()[candidate_idx], + cur_scores.data()[candidate_idx]); + candidate->AppendTo(prefix.get()); + beam_nodes.push_back(std::unique_ptr>(candidate)); + } + prefix.release(); + } + } + } + result.push_back(std::move(beam_nodes)); + } + return result; +} + +template +void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + size_t src_num = sentence_vector_list.size(); + + PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); + + std::vector source_level_lod = {0}; + std::vector sentence_level_lod = {0}; + std::vector id_data; + std::vector score_data; + + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (Sentence& sentence : sentence_vector_list[src_idx]) { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + sentence_level_lod.push_back(sentence_level_lod.back() + + sentence.word_ids.size()); + } + source_level_lod.push_back(source_level_lod.back() + + sentence_vector_list[src_idx].size()); + } + + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + + framework::LoD lod; + lod.push_back(source_level_lod); + lod.push_back(sentence_level_lod); + + id_tensor->set_lod(lod); + id_tensor->Resize({static_cast(id_data.size())}); + id_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); + + score_tensor->set_lod(lod); + score_tensor->Resize({static_cast(score_data.size())}); + score_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); +} + +template +void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); + PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), + "step_ids and step_scores should be the same"); + const size_t step_num = step_ids.size(); + const size_t src_num 
= step_ids.at(0).lod().at(kSourceLevel).size() - 1; + + PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); + + // previous prefixes for each step, + // the init length is 0, means this is the first step. + std::vector> beamnode_vector_list(0); + std::vector> sentence_vector_list(src_num); + + // pack all steps for one batch first, then another batch + for (size_t step_id = 0; step_id < step_num; ++step_id) { + beamnode_vector_list = + PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), + beamnode_vector_list, &sentence_vector_list); + } + // append last beam_node to result + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + for (auto& beam_node : beamnode_vector_list.at(src_idx)) { + sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); + beam_node.reset(); + } + } + + ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, + score_tensor); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f87279d5eaa19715c31b2228c7a22d4723efae --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "gtest/gtest.h" + +using CPUPlace = paddle::platform::CPUPlace; +using LoD = paddle::framework::LoD; +using LoDTensor = paddle::framework::LoDTensor; +using LoDTensorArray = paddle::framework::LoDTensorArray; + +template +using BeamNode = paddle::operators::BeamNode; +template +using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; +template +using Sentence = paddle::operators::Sentence; +template +using BeamNodeVector = paddle::operators::BeamNodeVector; +template +using SentenceVector = paddle::operators::SentenceVector; + +namespace paddle { +namespace test { + +void GenerateExample(const std::vector& level_0, + const std::vector& level_1, + const std::vector& data, LoDTensorArray* ids, + LoDTensorArray* scores) { + PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1, + "source level is used to describe candidate set"); + PADDLE_ENFORCE_EQ(level_1.back(), data.size(), + "the lowest level is used to describe data" + ", so it's last element should be data length"); + + CPUPlace place; + + LoD lod; + lod.push_back(level_0); + lod.push_back(level_1); + + // Ids + LoDTensor tensor_id; + tensor_id.set_lod(lod); + tensor_id.Resize({static_cast(data.size())}); + // malloc memory + int64_t* id_ptr = tensor_id.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + id_ptr[i] = static_cast(data.at(i)); + } + + // Scores + LoDTensor tensor_score; + tensor_score.set_lod(lod); + tensor_score.Resize({static_cast(data.size())}); + // malloc memory + float* score_ptr = tensor_score.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + score_ptr[i] = static_cast(data.at(i)); + } + + ids->push_back(tensor_id); + scores->push_back(tensor_score); +} + +} // namespace test +} // namespace paddle + +TEST(BeamSearchDecodeOp, DeleteBeamNode) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* b2 = new BeamNode(2, 2); + auto* b3 = new BeamNode(3, 3); + + b1->AppendTo(root); + b2->AppendTo(root); + b3->AppendTo(b1); + + delete b3; + delete b2; +} + +TEST(BeamSearchDecodeOp, MakeSentence) { + auto* root = new BeamNode(0, 0); + auto* b1 = new BeamNode(1, 1); + auto* end = new BeamNode(2, 2); + b1->AppendTo(root); + end->AppendTo(b1); + + BeamSearchDecoder helper; + Sentence sentence = helper.MakeSentence(end); + delete end; + + std::vector expect_ids = {0, 1, 2}; + ASSERT_EQ(sentence.word_ids, expect_ids); + + std::vector expect_scores = {0, 1, 2}; + ASSERT_EQ(sentence.scores, expect_scores); +} + +TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { + CPUPlace place; + + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + BeamSearchDecoder helper; + beamnode_vector_list = helper.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + ASSERT_EQ(beamnode_vector_list.size(), 2UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); +} + +TEST(BeamSearchDecodeOp, PackTwoSteps) { + CPUPlace place; + + // first source has three prefix + BeamNodeVector source0_prefixes; + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(1, 1))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(0, 0))); + source0_prefixes.push_back( + std::unique_ptr>(new BeamNode(3, 3))); + + 
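+  // (word ids 1, 0 and 3 above act as three independent beam prefixes for
+  //  the first source sentence)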
// second source has two prefix + BeamNodeVector source1_prefixes; + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(4, 4))); + source1_prefixes.push_back( + std::unique_ptr>(new BeamNode(5, 5))); + + std::vector> beamnode_vector_list; + std::vector> sentence_vector_list( + 2, SentenceVector()); + + beamnode_vector_list.push_back(std::move(source0_prefixes)); + beamnode_vector_list.push_back(std::move(source1_prefixes)); + + // generate data for one step + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample(std::vector{0, 3, 5}, + std::vector{0, 1, 1, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + BeamSearchDecoder helper1; + beamnode_vector_list = helper1.PackTwoSteps( + ids[0], scores[0], beamnode_vector_list, &sentence_vector_list); + + ASSERT_EQ(sentence_vector_list[0].size(), 1UL); + ASSERT_EQ(sentence_vector_list[1].size(), 0UL); + ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); + ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); +} + +TEST(BeamSearchDecodeOp, PackAllSteps) { + CPUPlace place; + + // we will constuct a sample data with 3 steps and 2 source sentences + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, + std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + paddle::test::GenerateExample( + std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, + std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 3, 6}, + std::vector{0, 0, 1, 2, 3, 4, 5}, + std::vector{0, 1, 2, 3, 4}, &ids, &scores); + + ASSERT_EQ(ids.size(), 3UL); + ASSERT_EQ(scores.size(), 3UL); + + BeamSearchDecoder helper; + + LoDTensor id_tensor; + LoDTensor score_tensor; + helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + + LoD lod = id_tensor.lod(); + std::vector expect_source_lod = {0, 4, 8}; + EXPECT_EQ(lod[0], expect_source_lod); + std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + EXPECT_EQ(lod[1], expect_sentence_lod); + // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 + std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, + 4, 3, 2, 4, 4, 3, 6, 5, 4}; + ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); + for (size_t i = 0; i < expect_data.size(); ++i) { + ASSERT_EQ(id_tensor.data()[i], + static_cast(expect_data[i])); + } + for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) { + ASSERT_EQ(score_tensor.data()[i], + static_cast(id_tensor.data()[i])); + } +} diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f4c8c7e06ee17b4cf3880db7bc8ddfbb88df3b8 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cc @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores) { + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + + auto items = SelectTopBeamSizeItems(); + auto selected_items = ToMap(items, high_level.back()); + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset:" << i; + for (auto &item : selected_items[i]) { + VLOG(3) << ItemToString(item); + } + } + PruneEndidCandidates(pre_ids, &selected_items); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + std::map> hash; + framework::LoD new_lod; + auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); + auto *scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + sort(items.begin(), items.end(), [](const Item &a, const Item &b) { + if (a.offset < b.offset) { + return true; + } + return a.id < b.id; + }); + for (auto &item : items) { + ids_data[low_offset] = item.id; + scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + selected_scores->set_lod(lod); +} + +int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, + std::vector> *items) { + auto *pre_ids_data = pre_ids.data(); + + int res = 0; + for (size_t offset = 0; offset < items->size(); offset++) { + auto prefix_id = pre_ids_data[offset]; + if (prefix_id == end_id_) { + items->at(offset).clear(); + } else { + res++; + } + } + + return res; +} + +std::vector> BeamSearch::ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; +} + +std::vector> +BeamSearch::SelectTopBeamSizeItems() { + std::vector> result; + std::vector items; + // for each source sentence, select the top beam_size items across all + // candidate sets. + while (NextItemSet(&items)) { + std::nth_element(std::begin(items), std::begin(items) + beam_size_, + std::end(items), [](const Item &a, const Item &b) { + // TODO(superjom) make score's comparation customizable. + // partial sort in descending order + return a.score > b.score; + }); + // prune the top beam_size items. 
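+    // (std::nth_element above assumes items.size() >= beam_size_; the resize
+    //  below only triggers when more than beam_size_ candidates remain)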
+ if (items.size() > beam_size_) { + items.resize(beam_size_); + } + result.emplace_back(items); + } + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << ItemToString(item); + } + } + + return result; +} + +// the candidates of a source +bool BeamSearch::NextItemSet(std::vector *items) { + if (sent_offset_ >= ids_->NumElements(lod_level_)) { + return false; + } + // find the current candidates + auto ids = *ids_; + auto scores = *scores_; + + auto abs_lod = framework::ToAbsOffset(ids.lod()); + + auto *ids_data = ids.data(); + auto *scores_data = scores.data(); + + size_t instance_dim = 1; + for (int i = 1; i < ids.dims().size(); i++) { + instance_dim *= ids.dims()[i]; + } + + items->clear(); + items->reserve(framework::product(ids.dims())); + for (size_t offset = abs_lod[lod_level_][sent_offset_]; + offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } + } + + sent_offset_++; + return true; +} + +std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { + os << "{"; + os << "offset: " << item.offset << ", "; + os << "id: " << item.id << ", "; + os << "score: " << item.score << ""; + os << "}"; + + return os; +} + +std::string ItemToString(const BeamSearch::Item &item) { + std::ostringstream stream; + stream << item; + return stream.str(); +} + +class BeamSearchProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + public: + BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + // inputs and outputs stored in proto + AddInput("pre_ids", "ids in previous step"); + AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("scores", + "a LoDTensor that has the same shape and LoD with `ids`"); + AddOutput("selected_ids", + "a LoDTensor that stores the IDs selected by beam search"); + AddOutput( + "selected_scores", + "a LoDTensor that has the same shape and LoD with `selected_ids`"); + + // Attributes stored in AttributeMap + AddAttr("level", "the level of LoDTensor"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); + + AddComment( + "This is a beam search operator that help to generate sequences."); + } +}; + +class BeamSearchInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + for (const std::string &arg : + std::vector({"pre_ids", "ids", "scores"})) { + PADDLE_ENFORCE(context->HasInput(arg), + "BeamSearch need input argument '%s'", arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(context->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("selected_ids")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto &o : op_desc.Output("selected_scores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + 
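+// Illustrative walk-through of the selection path above, with assumed numbers
+// that are not taken from any test: given beam_size_ = 2 and one prefix whose
+// candidate items are {offset 0, id 4, score 0.5}, {offset 0, id 2, score 0.3}
+// and {offset 0, id 5, score 0.2}, SelectTopBeamSizeItems() keeps the two
+// highest-scoring items (ids 4 and 2). PruneEndidCandidates() then clears the
+// candidate set of any prefix whose pre_ids entry equals end_id_, and
+// operator() packs whatever remains into selected_ids / selected_scores with
+// a two-level LoD.
+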
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, + paddle::operators::BeamSearchProtoAndCheckerMaker, + paddle::operators::BeamSearchInferShape, + paddle::operators::BeamSearchInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9e2a05a60c30e388093aceddd40e58273364c8f9 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.h @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include "gtest/gtest.h" +#endif + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * LoD (should have 2 levels) + * first level: [0, 1, 4] + * second level: [0, 1, 2, 3, 4] + * + * tensor's data + * [ + * [4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1] + * ] + * + * scores: + * LoD same as `ids` + * tensor's data + * [ + * [0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1] + * ] + * + * the inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * lets assume beam size is 2, and the beam search's output should be + * LoD + * first level: + * [0, 1, 2] + * second level: + * [0, 2, 4] + * + * id tensor's data + * [[ + * 4, + * 1, + * 3, + * 8, + * ]] + * + * score tensor's data + * [[ + * 0.5, + * 0.3, + * 0.9, + * 0.7 + * ]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +class BeamSearch { + public: + // TODO(superjom) make type customizable + using id_t = size_t; + using score_t = float; + /* + * Input the arguments that needed by this class. + */ + BeamSearch(const framework::LoDTensor& ids, + const framework::LoDTensor& scores, size_t level, size_t beam_size, + int end_id) + : beam_size_(beam_size), + ids_(&ids), + scores_(&scores), + lod_level_(level), + end_id_(end_id) {} + + /* + * The main function of beam search. 
+ * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1] + * - [0 1 2]] + * - [[] + * - [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the + * source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. + */ + void operator()(const framework::LoDTensor& pre_ids, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores); + /* + * The basic items help to sort. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + id_t id; + // the corresponding score + score_t score; + }; + + protected: + /* + * Delete all the records that follows the end token. + */ + int PruneEndidCandidates(const framework::LoDTensor& pre_ids, + std::vector>* items); + + /* + * Transform the items into a map whose key is offset, value is the items. + * NOTE low performance + */ + std::vector> ToMap( + const std::vector>& inputs, size_t element_num); + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems(); + + /* + * Get the items of next source sequence, return false if no remaining items. 
+ */ + bool NextItemSet(std::vector* items); + + private: + size_t beam_size_; + const framework::LoDTensor* ids_; + const framework::LoDTensor* scores_; + size_t lod_level_{0}; + size_t sent_offset_{0}; + int end_id_{0}; +}; + +std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); + +std::string ItemToString(const BeamSearch::Item& item); + +class BeamSearchOp : public framework::OperatorBase { + public: + BeamSearchOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + BeamSearchOp(const BeamSearchOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not Implemented"); + } + + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto ids_var = scope.FindVar(Input("ids")); + auto scores_var = scope.FindVar(Input("scores")); + auto pre_ids_var = scope.FindVar(Input("pre_ids")); + PADDLE_ENFORCE_NOT_NULL(ids_var); + PADDLE_ENFORCE_NOT_NULL(scores_var); + PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + + auto& ids = ids_var->Get(); + auto& scores = scores_var->Get(); + auto& pre_ids = pre_ids_var->Get(); + size_t level = Attr("level"); + size_t beam_size = Attr("beam_size"); + int end_id = Attr("end_id"); + BeamSearch alg(ids, scores, level, beam_size, end_id); + + auto selected_ids_var = scope.FindVar(Output("selected_ids")); + auto selected_scores_var = scope.FindVar(Output("selected_scores")); + PADDLE_ENFORCE_NOT_NULL(selected_ids_var); + PADDLE_ENFORCE_NOT_NULL(selected_scores_var); + auto& selected_ids_tensor = + *selected_ids_var->GetMutable(); + auto& selected_scores_tensor = + *selected_scores_var->GetMutable(); + alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea2afda4d492ccde8889a394201b398eeff3badb --- /dev/null +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include + +namespace paddle { +namespace test { + +using std::vector; +using framework::LoDTensor; +using framework::LoD; +using operators::BeamSearch; +using paddle::platform::CPUPlace; +using std::cout; +using std::endl; + +void CreateInput(LoDTensor* ids, LoDTensor* scores) { + LoD lod; + vector level0({0, 1, 4}); + vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = framework::make_ddim(vector({4, 3})); + ids->Resize(dims); + scores->Resize(dims); + CPUPlace place; + + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + vector _scores( + {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + + for (int i = 0; i < 12; i++) { + ids_data[i] = _ids[i]; + scores_data[i] = _scores[i]; + } +} + +TEST(beam_search_op, run) { + CPUPlace place; + LoDTensor ids, scores; + CreateInput(&ids, &scores); + + LoDTensor pre_ids; + pre_ids.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_ids.mutable_data(place)[i] = i + 1; + } + + BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + LoDTensor sids, sscores; + beamsearch(pre_ids, &sids, &sscores); + + LOG(INFO) << "score: " << sscores << endl; + + ASSERT_EQ(sids.lod(), sscores.lod()); + + vector tids({2, 4, 3, 8}); + vector tscores({0.3, 0.5, 0.9, 0.7}); + + for (int i = 0; i < 4; i++) { + ASSERT_EQ(tids[i], sids.data()[i]); + ASSERT_EQ(tscores[i], sscores.data()[i]); + } +} + +} // namespace test +} // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc378b1b4536273e4364a488eb7a4ca2cc706782 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class BilinearTensorProductOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL, + "The input(Weight) must be a 3D tensor."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y)."); + PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight)."); + PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL, + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector)."); + PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight)."); + } + + ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of bilinear_tensor_product operator."); + AddInput("Y", "The second input of bilinear_tensor_product operator."); + AddInput("Weight", + "The learnable parameters of bilinear_tensor_product operator."); + AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.") + .AsDispensable(); + AddOutput("Out", "The output of bilinear_tensor_product operator."); + AddComment(R"DOC( +Bilinear Tensor Product operator. +Given input X and Y, a 3D tensor Weight and a Bias. Each column of the +Output is computed by one slice $i = 1, . . . 
, k$ of the tensor: + +$$ +M = (X W_i) * Y \\ +Out_i = \sum_j {M_j} + Bias_i +$$ + +Where $W_i$ is the $i$-th slice of Input(Weight); + $M_j$ is the $j$-th column of $M$; + $Out_i$ is the $i$-th column of Output(Out); + $Bias_i$ is a column vector, each element of it is equal to + the $i$-th element of $Bias$; + +)DOC"); + } +}; + +class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, + "The input(Out@GRAD) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ( + x_dims[0], out_dims[0], + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X)."); + PADDLE_ENFORCE_EQ( + weight_dims[0], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ( + bias_dims[1], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the second dimension of the Input(Bias)."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + auto weight_grad_name = framework::GradVarName("Weight"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + if (ctx->HasOutput(weight_grad_name)) { + ctx->SetOutputDim(weight_grad_name, weight_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, + ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad, + ops::BilinearTensorProductOpGrad); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2cec48ee69ac500c0b0ba84f4b6fc20415f4b82c --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
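Equivalently, per example $b$ in the batch (a restatement of the formula above, with $x_b$ and $y_b$ denoting the $b$-th rows of $X$ and $Y$):

$$
Out_{b,i} = x_b^{\top} W_i \, y_b + Bias_i, \qquad i = 1, \dots, k
$$

so each slice $W_i$ of the weight tensor defines one bilinear form, and the op evaluates all $k$ of them on every $(x_b, y_b)$ pair.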
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h new file mode 100644 index 0000000000000000000000000000000000000000..626fa957c42c02c978519c1869cd5f0679d22a26 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class BilinearTensorProductKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto y_mat = EigenMatrix::From(*y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + + // Create the intermediate variable to caculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. 
+ Tensor left_mul; + left_mul.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + Tensor weight_mat = + weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias) { + auto bias_vec = EigenMatrix::From(*bias); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + } + } +}; + +template +class BilinearTensorProductGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* weight = ctx.Input("Weight"); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + Tensor* d_y = ctx.Output(framework::GradVarName("Y")); + Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); + Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(*x); + auto y_mat = EigenMatrix::From(*y); + auto d_out_mat = EigenMatrix::From(*d_out); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + // Create the intermediate variable to caculate the Output(Y@Grad). + Tensor x_scale; + x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), + ctx.GetPlace()); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to caculate the Output(X@Grad). + Tensor y_scale; + y_scale.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto y_scale_mat = EigenMatrix::From(y_scale); + + math::SetConstant set_zero; + + // Set Output(X@Grad) be zero. + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_x, static_cast(0)); + } + + // Set Output(Y@Grad) be zero. + if (d_y) { + d_y->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_y, static_cast(0)); + } + + // Caculate the Output(X@Grad) and Output(Y@Grad). + if (d_x || d_y) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor weight_i = weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + if (d_x) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) * + y_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); + } + if (d_y) { + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y) * + x_mat; + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); + } + } + } + + // Caculate the gradient of Input(Weight). 
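For reference, the quantities that the gemm calls above (for X@Grad and Y@Grad) and below (for Weight@Grad and Bias@Grad) accumulate are the usual bilinear-form gradients, a sketch inferred from $Out_{b,i} = x_b^{\top} W_i y_b + Bias_i$ (the kernel adds the $i$-indexed terms one weight slice at a time):

$$
\frac{\partial L}{\partial x_b} = \sum_i dOut_{b,i}\, W_i\, y_b, \qquad
\frac{\partial L}{\partial y_b} = \sum_i dOut_{b,i}\, W_i^{\top} x_b,
$$
$$
\frac{\partial L}{\partial W_i} = \sum_b dOut_{b,i}\, x_b\, y_b^{\top}, \qquad
\frac{\partial L}{\partial Bias_i} = \sum_b dOut_{b,i}.
$$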
+ if (d_weight) { + d_weight->mutable_data(ctx.GetPlace()); + Eigen::DSizes bcast_for_weight(1, x_dim); + for (int i = 0; i < out_dim; ++i) { + Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + x_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_weight) * + x_mat; + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); + } + } + + // Caculate the gradient of Input(Bias). + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); + d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d614bf704382da7743b2128fa57a147e8db33d24 --- /dev/null +++ b/paddle/fluid/operators/bipartite_match_op.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class BipartiteMatchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DistMat"), + "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); + + auto dims = ctx->GetInputDim("DistMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("ColToRowMatchIndices", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); + } +}; + +template +class BipartiteMatchKernel : public framework::OpKernel { + public: + // The match_indices must be initialized to -1 at first. + // The match_dist must be initialized to 0 at first. 
+ void BipartiteMatch(const Tensor& dist, int* match_indices, + T* match_dist) const { + constexpr T kEPS = static_cast(1e-6); + PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + continue; + } + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* dist_mat = context.Input("DistMat"); + auto* match_indices = context.Output("ColToRowMatchIndices"); + auto* match_dist = context.Output("ColToRowMatchDist"); + + auto& dev_ctx = context.device_context(); + + auto col = dist_mat->dims()[1]; + + int64_t n = dist_mat->lod().size() == 0UL + ? 1 + : static_cast(dist_mat->lod().back().size() - 1); + if (dist_mat->lod().size()) { + PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + match_indices->mutable_data({n, col}, context.GetPlace()); + match_dist->mutable_data({n, col}, context.GetPlace()); + + math::SetConstant iset; + iset(dev_ctx, match_indices, static_cast(-1)); + math::SetConstant tset; + tset(dev_ctx, match_dist, static_cast(0)); + + int* indices = match_indices->data(); + T* dist = match_dist->data(); + if (n == 1) { + BipartiteMatch(*dist_mat, indices, dist); + } else { + auto lod = dist_mat->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); + } + } + } +}; + +class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "DistMat", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[K, M]. It is pair-wise distance matrix between the entities " + "represented by each row and each column. For example, assumed one " + "entity is A with shape [K], another entity is B with shape [M]. The " + "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " + "the distance is, the better macthing the pairs are. Please note, " + "This tensor can contain LoD information to represent a batch of " + "inputs. One instance of this batch can contain different numbers of " + "entities."); + AddOutput("ColToRowMatchIndices", + "(Tensor) A 2-D Tensor with shape [N, M] in int type. " + "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " + "means B[j] does not match any entity in i-th instance. " + "Otherwise, it means B[j] is matched to row " + "ColToRowMatchIndices[i][j] in i-th instance. 
The row number of " + "i-th instance is saved in ColToRowMatchIndices[i][j]."); + AddOutput("ColToRowMatchDist", + "(Tensor) A 2-D Tensor with shape [N, M] in float type. " + "N is batch size. If ColToRowMatchIndices[i][j] is -1, " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchIndices[i][j] = d, and the row offsets of each " + "instance are called LoD. Then " + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); + AddComment(R"DOC( +This operator is a greedy bipartite matching algorithm, which is used to +obtain the matching with the maximum distance based on the input +distance matrix. For input 2D matrix, the bipartite matching algorithm can +find the matched column for each row, also can find the matched row for +each column. And this operator only calculate matched indices from column +to row. For each instance, the number of matched indices is the number of +of columns of the input ditance matrix. + +There are two outputs to save matched indices and distance. +A simple description, this algothrim matched the best (maximum distance) +row entity to the column entity and the matched indices are not duplicated +in each row of ColToRowMatchIndices. If the column entity is not matched +any row entity, set -1 in ColToRowMatchIndices. + +Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. +If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. +If Tensor, the height of ColToRowMatchIndices is 1. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp, + ops::BipartiteMatchOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel, + ops::BipartiteMatchKernel); diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e0fee22d8d828978fe74bd48e46ea6c8063150d --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
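The greedy matching loop in BipartiteMatch() above can be summarized, independent of the framework types, as: repeatedly take the largest remaining distance over unmatched columns and still-available rows, record the pair, and retire the row. A compact standalone sketch (plain C++, illustrative names; assumes a dense row-major distance matrix):

#include <vector>

// Greedy bipartite matching, mirroring BipartiteMatch() above.
void GreedyMatch(const std::vector<double>& dist, int rows, int cols,
                 std::vector<int>* match_indices,    // size cols, initialized to -1
                 std::vector<double>* match_dist) {  // size cols, initialized to 0
  const double kEps = 1e-6;
  std::vector<bool> row_used(rows, false);
  for (int step = 0; step < rows && step < cols; ++step) {
    int best_r = -1, best_c = -1;
    double best = -1.0;
    for (int c = 0; c < cols; ++c) {
      if ((*match_indices)[c] != -1) continue;  // column already matched
      for (int r = 0; r < rows; ++r) {
        if (row_used[r]) continue;              // row already taken
        double d = dist[r * cols + c];
        if (d < kEps) continue;                 // treat ~0 as "no match"
        if (d > best) { best = d; best_r = r; best_c = c; }
      }
    }
    if (best_c == -1) break;  // nothing left to match
    (*match_indices)[best_c] = best_r;
    (*match_dist)[best_c] = best;
    row_used[best_r] = true;
  }
}

As in the kernel, distances below a small epsilon are ignored, so columns with no positive-distance row keep the initial -1 / 0 outputs.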
*/ + +#include "paddle/fluid/operators/box_coder_op.h" + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBoxVar must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + + GetBoxCodeType(ctx->Attrs().Get("code_type")); + + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance."); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the box if the input " + "is image feature map, they are close to the origin of the coordinate " + "system. [xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddOutput( + "OutputBox", + "(LoDTensor or Tensor) " + "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] " + "representing the result of N target boxes encoded/decoded with " + "M Prior boxes and variances."); + + AddComment(R"DOC( +Bounding Box Coder Operator. +Encode/Decode the target bounding box with the priorbox information. 
+The Encoding schema described below: +ox = (tx - px) / pw / pxv +oy = (ty - py) / ph / pyv +ow = log(abs(tw / pw)) / pwv +oh = log(abs(th / ph)) / phv +The Decoding schema described below: +ox = (pw * pxv * tx * + px) - tw / 2 +oy = (ph * pyv * ty * + py) - th / 2 +ow = exp(pwv * tw) * pw + tw / 2 +oh = exp(phv * th) * ph + th / 2 +where tx, ty, tw, th denote the target box's center coordinates, width and +height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) +center coordinates, width and height. pxv, pyv, pwv, phv denote the variance +of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, +width and height. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dd9299ceacdf2507f51f895c71041c1645dd8371 --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
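Written in center-size form, the transforms that the CPU and GPU kernels below implement are (p* = prior box, t* = target box or encoded offsets, v* = prior-box variance):

Encode:
$$
o_x = \frac{t_{cx} - p_{cx}}{p_w\, v_x}, \quad
o_y = \frac{t_{cy} - p_{cy}}{p_h\, v_y}, \quad
o_w = \frac{\log\left|t_w / p_w\right|}{v_w}, \quad
o_h = \frac{\log\left|t_h / p_h\right|}{v_h}
$$

Decode:
$$
d_{cx} = v_x\, t_x\, p_w + p_{cx}, \quad
d_{cy} = v_y\, t_y\, p_h + p_{cy}, \quad
d_w = e^{v_w t_w}\, p_w, \quad
d_h = e^{v_h t_h}\, p_h,
$$

with the decoded box written out as $[\,d_{cx} - d_w/2,\; d_{cy} - d_h/2,\; d_{cx} + d_w/2,\; d_{cy} + d_h/2\,]$.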
*/ + +#include "paddle/fluid/operators/box_coder_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = + target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; + T target_box_height = + target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + + output[idx * len] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[col_idx * len]; + output[idx * len + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / + prior_box_var_data[col_idx * len + 1]; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) / + prior_box_var_data[col_idx * len + 2]; + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) / + prior_box_var_data[col_idx * len + 3]; + } +} + +template +__global__ void DecodeCenterSizeKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int row, + const int col, const int len, + T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = + prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; + T prior_box_height = + prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_center_x = + (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; + T prior_box_center_y = (prior_box_data[col_idx * len + 3] + + prior_box_data[col_idx * len + 1]) / + 2; + + T target_box_width = exp(prior_box_var_data[col_idx * len + 2] * + target_box_data[row_idx * len + 2]) * + prior_box_width; + T target_box_height = exp(prior_box_var_data[col_idx * len + 3] * + target_box_data[row_idx * len + 3]) * + prior_box_height; + T target_box_center_x = prior_box_var_data[col_idx * len] * + target_box_data[row_idx * len] * + prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[col_idx * len + 1] * + target_box_data[row_idx * len + 1] * + prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = target_box_center_x + target_box_width / 2; + output[idx * len + 3] = target_box_center_y + target_box_height / 2; + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + 
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T* prior_box_data = prior_box->data(); + const T* prior_box_var_data = prior_box_var->data(); + const T* target_box_data = target_box->data(); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c41bcc212b8fcfc4a274a53db4b25161ecdb3fe5 --- /dev/null +++ b/paddle/fluid/operators/box_coder_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
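The CUDA kernels above use one thread per (target box, prior box) pair; the launch configuration and the index arithmetic reduce to a ceiling division and a divmod, sketched standalone below (illustrative sizes):

#include <iostream>

int main() {
  const int row = 1000, col = 300;  // #target boxes, #prior boxes
  const int block = 512;            // threads per block, as in the launch above
  const int grid = (row * col + block - 1) / block;  // ceiling division
  std::cout << "launch " << grid << " blocks of " << block << " threads\n";

  // Inside the kernel, the flat thread id is mapped back to a pair:
  int idx = 12345;          // stands for threadIdx.x + blockIdx.x * blockDim.x
  int row_idx = idx / col;  // which target box
  int col_idx = idx % col;  // which prior box
  std::cout << row_idx << " " << col_idx << "\n";
  return 0;
}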
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string& type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } + } + void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, + T* output) const { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = + prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[i * len] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[i * len + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[i * len + 2]) * + prior_box_width; + T 
target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[i * len + 3]) * + prior_box_height; + + size_t offset = i * col * len + j * len; + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + T* output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..364c21f7619910784d63047f3abb3713f1bfd0fc --- /dev/null +++ b/paddle/fluid/operators/cast_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of cast op"); + AddOutput("Out", "The output tensor of cast op"); + AddAttr("out_dtype", "output data type"); + AddAttr("in_dtype", "input data type"); + AddComment(R"DOC( +Cast Operator. + +This Operator casts the input tensor to another data type and +returns tha Output Tensor. 
+ +)DOC"); + } +}; + +class CastOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set"); + PADDLE_ENFORCE(context->HasOutput("Out"), + "The output of cast op must be set"); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CastOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); + grad->SetType("cast"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("X")); + grad->SetAttr("out_dtype", GetAttr("in_dtype")); + grad->SetAttr("in_dtype", GetAttr("out_dtype")); + return std::unique_ptr(grad); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, + ops::CastOpProtoMaker); +REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb597be9d93af12afb608bf87382c283ddf78e7c --- /dev/null +++ b/paddle/fluid/operators/cast_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" + +template +using CastOpKernel = + paddle::operators::CastOpKernel; + +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9ab4961cef4bd6e7d4e592581b51d7d4eb896ec7 --- /dev/null +++ b/paddle/fluid/operators/cast_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +struct CastOpFunctor { + const framework::Tensor* in_; + framework::Tensor* out_; + const DeviceContext& ctx_; + CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const DeviceContext& ctx) + : in_(in), out_(out), ctx_(ctx) {} + + template + void operator()() const { + auto* in_begin = in_->data(); + auto numel = in_->numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(ctx_.GetPlace()); + platform::Transform trans; + trans(ctx_, in_begin, in_end, out_begin, + CastOpTransformFunctor()); + } +}; + +template +class CastOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast(context.Attr("out_dtype")), + CastOpFunctor( + in, out, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..080e4d80da4752a0c6bea86c0a9f503cf46e8878 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
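CastOpFunctor above is an element-wise static_cast over the tensor buffer, dispatched on the runtime out_dtype; a minimal standalone analogue in plain C++ (illustrative names) is:

#include <algorithm>
#include <cstdint>
#include <vector>

// Element-wise cast, mirroring CastOpTransformFunctor: out[i] = static_cast<OutT>(in[i]).
template <typename InT, typename OutT>
std::vector<OutT> Cast(const std::vector<InT>& in) {
  std::vector<OutT> out(in.size());
  std::transform(in.begin(), in.end(), out.begin(),
                 [](InT v) { return static_cast<OutT>(v); });
  return out;
}

int main() {
  std::vector<float> x{0.6f, 1.4f, -2.7f};
  auto y = Cast<float, std::int64_t>(x);  // {0, 1, -2}: truncation toward zero
  return static_cast<int>(y.size());
}

Note also that CastOpGradMaker earlier builds the backward op by emitting another cast with in_dtype and out_dtype swapped, since casting back is the natural gradient path for this op.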
*/ + +#include "paddle/fluid/operators/chunk_eval_op.h" + +namespace paddle { +namespace operators { + +class ChunkEvalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Inference"), + "Input(Inference) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Precision"), + "Output(Precision) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Recall"), + "Output(Recall) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), + "Output(F1-Score) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"), + "Output(NumInferChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"), + "Output(NumLabelChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NumCorrectChunks"), + "Output(NumCorrectChunks) of ChunkEvalOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Inference"); + auto label_dim = ctx->GetInputDim("Label"); + + PADDLE_ENFORCE(inference_dim == label_dim, + "Inference's shape must be the same as Label's shape."); + + ctx->SetOutputDim("Precision", {1}); + ctx->SetOutputDim("Recall", {1}); + ctx->SetOutputDim("F1-Score", {1}); + ctx->SetOutputDim("NumInferChunks", {1}); + ctx->SetOutputDim("NumLabelChunks", {1}); + ctx->SetOutputDim("NumCorrectChunks", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + platform::CPUPlace()); + } +}; + +class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Inference", + "(Tensor, default: Tensor). " + "Predictions from the network."); + AddInput("Label", + "(Tensor, default: Tensor). The true tag sequences."); + AddOutput("Precision", + "(float). The evaluated precision (called positive predictive " + "value) of chunks on the given mini-batch."); + AddOutput("Recall", + "(float). The evaluated recall (true positive rate or " + "sensitivity) of chunks on the given mini-batch."); + AddOutput("F1-Score", + "(float). The evaluated F1-Score on the given mini-batch."); + AddOutput("NumInferChunks", + "(int64_t). The number of chunks in Inference on the given " + "mini-batch."); + AddOutput( + "NumLabelChunks", + "(int64_t). The number of chunks in Label on the given mini-batch."); + AddOutput( + "NumCorrectChunks", + "(int64_t). The number of chunks both in Inference and Label on the " + "given mini-batch."); + AddAttr("num_chunk_types", + "(int). The number of chunk type. See below for details."); + AddAttr( + "chunk_scheme", + "(string, default IOB). The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " + "for details.") + .SetDefault("IOB"); + AddAttr>("excluded_chunk_types", + "(list) A list including chunk type ids " + "indicating chunk types that are not counted. " + "See below for details.") + .SetDefault(std::vector{}); + AddComment(R"DOC( +For some basics of chunking, please refer to +‘Chunking with Support Vector Machines ’. 
+ + +CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. +Here is a NER example of labeling for these tagging schemes: + + Li Ming works at Agricultural Bank of China in Beijing. + IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + +There are three chunk types(named entity types) including PER(person), ORG(organization) +and LOC(LOCATION), and we can see that the labels have the form -. + +Since the calculations actually use label ids rather than labels, extra attention +should be paid when mapping labels to ids to make CheckEvalOp work. The key point +is that the listed equations are satisfied by ids. + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + +where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` +is the num of chunk types, and `tag_type` get its value from the following table. + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + +Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, +PER and LOC. To satisfy the above equations, the label map can be like this: + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + +It’s not hard to verify the equations noting that the num of chunk types +is 3 and the num of tag types in IOB scheme is 2. For example, the label +id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of +I-LOC is 2, which consistent with the results from the equations. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp, + ops::ChunkEvalOpMaker); +REGISTER_OP_CPU_KERNEL(chunk_eval, + ops::ChunkEvalKernel); diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3dca3d2c0f99c3b2d447ffe4516c1b6c379b13f2 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
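A quick check of the label-decomposition equations above, using the IOB mapping from the table (standalone C++, values taken directly from the comment):

#include <cassert>

int main() {
  const int num_tag_types = 2;  // IOB scheme: Begin, Inside
  // Label ids from the table above: B-ORG=0, I-ORG=1, B-PER=2, I-PER=3,
  // B-LOC=4, I-LOC=5, O=6.
  int label = 5;                           // I-LOC
  int tag_type = label % num_tag_types;    // 1 -> Inside
  int chunk_type = label / num_tag_types;  // 2 -> LOC
  assert(tag_type == 1 && chunk_type == 2);
  return 0;
}

The metrics the kernel then reports, further below, follow the standard definitions: precision = num_correct / num_infer, recall = num_correct / num_label, and F1 = 2 * precision * recall / (precision + recall).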
*/ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class ChunkEvalKernel : public framework::OpKernel { + public: + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { + segments.clear(); + segments.reserve(length); + int chunk_start = 0; + bool in_chunk = false; + int tag = -1; + int type = other_chunk_type; + for (int i = 0; i < length; ++i) { + int prev_tag = tag; + int prev_type = type; + PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types); + tag = label[i] % num_tag_types; + type = label[i] / num_tag_types; + if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + Segment segment{ + chunk_start, // begin + i - 1, // end + prev_type, + }; + segments.push_back(segment); + in_chunk = false; + } + if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + chunk_start = i; + in_chunk = true; + } + } + if (in_chunk) { + Segment segment{ + chunk_start, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + bool ChunkEnd(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return false; + if (type == other_chunk_type) return true; + if (type != prev_type) return true; + if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_end) return true; + if (prev_tag == tag_single) return true; + return false; + } + + bool ChunkBegin(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return type != other_chunk_type; + if (type == other_chunk_type) return false; + if (type != prev_type) return true; + if (tag == tag_begin) return true; + if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_single) return true; + return false; + } + + void Compute(const framework::ExecutionContext& context) const override { + // initialize to parse configurations + int num_chunk_types, num_tag_types; + int other_chunk_type; + int tag_begin, tag_inside, tag_end, tag_single; + std::vector label_segments; + std::vector output_segments; + std::set excluded_chunk_types; + + if (context.Attr("chunk_scheme") == "IOB") { + num_tag_types = 2; + tag_begin = 0; + tag_inside = 1; + tag_end = -1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOE") { + num_tag_types = 2; + tag_begin = -1; + tag_inside = 0; + tag_end = 1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOBES") { + num_tag_types = 4; + tag_begin = 0; + tag_inside = 1; + tag_end = 2; + tag_single = 3; + } else if (context.Attr("chunk_scheme") == "plain") { + 
num_tag_types = 1; + tag_begin = -1; + tag_inside = -1; + tag_end = -1; + tag_single = -1; + } else { + PADDLE_THROW("Unknown chunk scheme."); + } + other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); + excluded_chunk_types.insert( + context.Attr>("excluded_chunk_types").begin(), + context.Attr>("excluded_chunk_types").end()); + + auto* inference = context.Input("Inference"); + auto place = inference->place(); + auto* label = context.Input("Label"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = context.Output("NumCorrectChunks"); + + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); + int64_t* num_infer_chunks_data = + num_infer_chunks->mutable_data(place); + int64_t* num_label_chunks_data = + num_label_chunks->mutable_data(place); + int64_t* num_correct_chunks_data = + num_correct_chunks->mutable_data(place); + *num_infer_chunks_data = 0; + *num_label_chunks_data = 0; + *num_correct_chunks_data = 0; + + auto lod = label->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE(lod == inference->lod(), + "LoD must be same between Inference and Label."); + int num_sequences = lod[0].size() - 1; + for (int i = 0; i < num_sequences; ++i) { + int seq_length = lod[0][i + 1] - lod[0][i]; + EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length, + output_segments, label_segments, *num_infer_chunks_data, + *num_label_chunks_data, *num_correct_chunks_data, + num_chunk_types, num_tag_types, other_chunk_type, tag_begin, + tag_inside, tag_end, tag_single, excluded_chunk_types); + } + *precision_data = !(*num_infer_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_infer_chunks_data); + *racall_data = !(*num_label_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_label_chunks_data); + *f1_data = !(*num_correct_chunks_data) + ? 
0 + : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); + } + + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, + std::vector& output_segments, + std::vector& label_segments, + int64_t& num_output_segments, int64_t& num_label_segments, + int64_t& num_correct, int num_chunk_types, int num_tag_types, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single, + const std::set& excluded_chunk_types) const { + GetSegments(output, length, output_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + size_t i = 0, j = 0; + while (i < output_segments.size() && j < label_segments.size()) { + if (output_segments[i] == label_segments[j] && + excluded_chunk_types.count(output_segments[i].type) != 1) { + ++num_correct; + } + if (output_segments[i].end < label_segments[j].end) { + ++i; + } else if (output_segments[i].end > label_segments[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for (auto& segment : label_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments; + } + for (auto& segment : output_segments) { + if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..89df118c06f4df6444fc1f61b5ddde48f6ad8ba7 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of clip_by_norm op." 
+ "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, + ops::ClipByNormOpMaker); +REGISTER_OP_CPU_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a466b335914f1fde6865c6cc375f4ef009632e41 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82bcf07657bfbf1df6b541b3953285622fb25a87 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ClipByNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max_norm = context.Attr("max_norm"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input); + auto out = EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); + auto& place = + *context.template device_context().eigen_device(); + + auto temp = (x_norm <= max_norm).template cast().eval(); + auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b2cefbf9dd67c5036b6ecfe35a9d53a54467a9 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace paddle { +namespace operators { + +class ClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); + PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor)The input of clip op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)"); + AddAttr( + "min", "(float)Minimum value, under which element is replaced by min."); + AddAttr( + "max", "(float)Maximum value, above which element is replaced by max"); + AddComment(R"DOC( +Clip Operator. + +The clip operator limits the value of given input within an interval. 
The +interval is specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ + +)DOC"); + } +}; + +class ClipOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, + ops::ClipOpGrad); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b044d6e699d59dda04fa19468c17faf1e0a0eb7 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aecd6f83bfaf4deab4271e859bea10feecacab62 --- /dev/null +++ b/paddle/fluid/operators/clip_op.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
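For reference, a minimal NumPy sketch of the clip formula given in the DOC above; the name is illustrative, not the fluid API.

```python
import numpy as np

def clip_ref(x, min_v, max_v):
    # Out = min(max(X, min), max), applied element-wise.
    return np.minimum(np.maximum(x, min_v), max_v)
```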
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Transform; + +template +class ClipFunctor { + public: + explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class ClipGradFunctor { + public: + explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? x : 0; + } + + private: + T min_; + T max_; +}; + +template +class ClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + int64_t numel = x->numel(); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); + } +}; + +template +class ClipGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* x = context.Input("X"); + int64_t numel = d_out->numel(); + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + const T* d_out_data = d_out->data(); + const T* x_data = x->data(); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3414c33b5ab3cc8dffee640fd85b9625b3f237b --- /dev/null +++ b/paddle/fluid/operators/compare_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
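The ClipGradFunctor above lets the upstream gradient through only where the original input lay strictly inside the interval. A small NumPy sketch of that backward rule, with illustrative names:

```python
import numpy as np

def clip_grad_ref(d_out, x, min_v, max_v):
    # Where x was clipped, the output is constant, so its derivative is zero;
    # elsewhere the upstream gradient passes through unchanged.
    mask = (x > min_v) & (x < max_v)
    return d_out * mask.astype(d_out.dtype)
```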
*/ + +#include "paddle/fluid/operators/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) the left hand operand of %s operator", + comment.type)); + AddInput("Y", string::Sprintf( + "(LoDTensor) the right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. Each of them is a +N-dim tensor. X and Y could be any type. The each element of the Out tensor is +calculated by %s +)DOC", + comment.type, comment.equation)); + AddAttr("axis", + "(int, default -1). The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + } +}; + +template +class CompareOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X", + comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y", + comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CompareOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CompareOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_LOGICAL_OP(less_than, "Out = X < Y"); +REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y"); +REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_OP(equal, "Out = X == Y"); +REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3507af2ae3add8cf02f5b9f3b3d89b40d73bcb0d --- /dev/null +++ b/paddle/fluid/operators/compare_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
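The three comparisons registered above each produce a bool tensor shaped like X. In this illustrative sketch, NumPy broadcasting plays roughly the role of the `axis` attribute that controls how Y is broadcast onto X:

```python
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6]])
y = np.array([2, 2, 2])

print(np.less(x, y))        # less_than:  Out = X < Y
print(np.less_equal(x, y))  # less_equal: Out = X <= Y
print(np.equal(x, y))       # equal:      Out = X == Y
```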
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/compare_op.h" + +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/compare_op.h b/paddle/fluid/operators/compare_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4b2ee5a9d68f5f1fd3d2d374669763855659f1db --- /dev/null +++ b/paddle/fluid/operators/compare_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LessThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } +}; + +template +struct LessEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } +}; + +template +struct EqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + if (std::is_floating_point::value) { + // This branch will be optimized while compiling if T is integer. It is + // safe to cast a and b to double. 
+ return fabs(static_cast(a - b)) < 1e-8; + } else { + return (a == b); + } + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + z->mutable_data(context.GetPlace()); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..68eb5412beb02d9dc948eeb188a2d5b1cdb0c5b3 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
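EqualFunctor above compares floating-point values within an absolute tolerance rather than bit-for-bit. A NumPy sketch of that behaviour (the helper name is illustrative):

```python
import numpy as np

def equal_ref(a, b):
    # Floating-point inputs are considered equal within 1e-8; other dtypes
    # are compared exactly, mirroring EqualFunctor.
    a, b = np.asarray(a), np.asarray(b)
    if np.issubdtype(a.dtype, np.floating):
        return np.abs(a - b) < 1e-8
    return a == b
```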
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include + +namespace paddle { +namespace operators { +using framework::Tensor; + +class ConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of ConcatOp should be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ConcatOp should not be null."); + + auto ins = ctx->GetInputsDim("X"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + const size_t n = ins.size(); + + PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + + auto out_dims = ins[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + out_dims[axis] += ins[i][j]; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } + } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input tensors of concat operator.").AsDuplicable(); + AddOutput("Out", "Output tensor of concat operator."); + AddAttr("axis", + "The axis along which the input tensors will be concatenated.") + .SetDefault(0); + AddComment(R"DOC( +Concat Operator. + +Concatenate the input tensors along dimension axis. +Examples: + Input[0] = [[1,2],[3,4]] + Input[1] = [[5,6]] + axis = 0 + Output = [[1,2], + [3,4], + [5,6]] + +)DOC"); + } +}; + +class ConcatOpGrad : public framework::OperatorWithKernel { + public: + ConcatOpGrad(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) +REGISTER_OP_CPU_KERNEL(concat, + ops::ConcatKernel) +REGISTER_OP_CPU_KERNEL(concat_grad, + ops::ConcatGradKernel) diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..143bda6116775611e399ad805708474621d33b96 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
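ConcatOp::InferShape above sums the sizes along the concatenation axis and requires every other dimension to agree. A small Python sketch of that rule, reusing the shapes from the DOC example (names are illustrative):

```python
def concat_infer_shape(shapes, axis):
    out = list(shapes[0])
    for s in shapes[1:]:
        for j, dim in enumerate(s):
            if j == axis:
                out[axis] += dim        # sizes add up along the concat axis
            else:
                assert out[j] == dim, "non-axis dimensions must match"
    return out

print(concat_infer_shape([(2, 2), (1, 2)], axis=0))  # -> [3, 2]
```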
*/ + +#include "paddle/fluid/operators/concat_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..72b3e225bf64f889804eb5e4fab9df4653f5452b --- /dev/null +++ b/paddle/fluid/operators/concat_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class ConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = ins.size(); + size_t output_offset = 0; + out->mutable_data(ctx.GetPlace()); + auto out_stride = framework::stride(out->dims()); + for (size_t i = 0; i < n; i++) { + auto& in = ins[i]; + auto axis_dim = in->dims()[axis]; + auto in_stride = framework::stride(in->dims()); + StridedMemcpy(ctx.device_context(), in->data(), in_stride, + in->dims(), out_stride, out->data() + output_offset); + output_offset += axis_dim * in_stride[axis]; + } + } +}; + +template +class ConcatGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input(framework::GradVarName("Out")); + auto outs = ctx.MultiOutput(framework::GradVarName("X")); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + auto in_stride = framework::stride(in->dims()); + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd93790d5b52a2ccc8358a94f7ead346d384f191 --- /dev/null +++ b/paddle/fluid/operators/cond_op.cc @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
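The backward kernel above slices the upstream gradient back into per-input pieces at the same offsets the forward copy used. A NumPy sketch of the forward/backward pair (illustrative names, not the fluid API):

```python
import numpy as np

def concat_ref(inputs, axis):
    return np.concatenate(inputs, axis=axis)

def concat_grad_ref(d_out, input_shapes, axis):
    # Split the gradient at the cumulative axis sizes of the inputs.
    sizes = [s[axis] for s in input_shapes]
    return np.split(d_out, np.cumsum(sizes)[:-1].tolist(), axis=axis)
```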
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cond_op.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using Scope = framework::Scope; +using Variable = framework::Variable; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DDim = framework::DDim; + +framework::Scope& CondOp::AddSubScope(const Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + auto sub_scopes = sub_scopes_var->GetMutable>(); + auto& sub_scope = scope.NewScope(); + sub_scopes->push_back(&sub_scope); + return sub_scope; +} + +std::vector& CondOp::GetSubScopes( + const framework::Scope& scope) const { + auto sub_scopes_var = scope.FindVar("SubScopes"); + PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, + "Output(SubScopes) of CondOp should not be null."); + return *sub_scopes_var->GetMutable>(); +} + +LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const { + auto index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + auto& index_tensors = + *index_tensors_var->GetMutable>(); + index_tensors.push_back(LoDTensor()); + return index_tensors.back(); +} + +std::vector& CondOp::GetIndexTensors( + const framework::Scope& scope) const { + auto* index_tensors_var = scope.FindVar("IndexTensors"); + PADDLE_ENFORCE_NOT_NULL(index_tensors_var, + "Output(IndexTensors) of CondOp should not be null."); + return *index_tensors_var->GetMutable>(); +} + +void CondOp::PrepareDataForSubnet( + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty."); + + for (int i = 0; i < BRANCH_NUM; ++i) { + // Create two sub scopes for true and false branches + // sub_scopes[0] for the true branch + // sub_scopes[1] for the false branch + AddSubScope(scope); + // Create two tensors for true and false indices: + // index_tensors[0] for the true branch + // index_tensors[1] for the false branch + AddIndexTensor(scope); + } + + Variable* cond_var = scope.FindVar(Input("Cond")); + PADDLE_ENFORCE_NOT_NULL(cond_var, + "Input(Cond) of CondOp should not be null."); + const LoDTensor* cond = cond_var->GetMutable(); + + // get the true/false index at runtime according to cond tensor + // index_vectors[0]: vector, contains all index for cond[i] == true + // index_vectors[1]: vector, contains all index for cond[i] == false + std::vector> index_vectors; + index_vectors.resize(BRANCH_NUM); + + const int* cond_data = cond->data(); + for (int i = 0; i < cond->dims()[0]; ++i) { + if (cond_data[i]) + index_vectors[TRUE_BRANCH].push_back(i); + else + index_vectors[FALSE_BRANCH].push_back(i); + } + + // put index_vectors[0] and index_vectors[1] into two tensors: + // index_tensors[0] and index_tensors[1] + std::vector& index_tensors = GetIndexTensors(scope); + std::vector& sub_scopes = GetSubScopes(scope); 
+ + for (int i = 0; i < BRANCH_NUM; ++i) { + DDim dim = {static_cast(index_vectors[i].size())}; + int* index_tensor_data_ptr = + index_tensors[i].mutable_data(dim, platform::CPUPlace()); + memcpy(index_tensor_data_ptr, index_vectors[i].data(), + dim[0] * sizeof(int)); + } + + // create input in subscopes according to index_vectors + for (auto& input : Inputs("Xs")) { + Variable* var_parent = scope.FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_parent); + const auto* tensor_parent = &var_parent->Get(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(input); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = var_child->GetMutable(); + + // Resize child + DDim dim = tensor_parent->dims(); + dim[0] = index_tensors[i].dims()[0]; + tensor_child->mutable_data(dim, platform::CPUPlace()); + + CPUGather(dev_ctx, *tensor_parent, index_tensors[i], tensor_child); + } + } + + // create output_tensors in subscope for sub_net + for (int i = 0; i < BRANCH_NUM; ++i) { + for (auto& output : (*sub_net_op_[i]).Outputs()) { + for (auto& var_name : output.second) { + sub_scopes[i]->Var(var_name); + } + } + } +} + +void CondOp::MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + std::vector& sub_scopes = GetSubScopes(scope); + const std::vector& index_tensors = + GetIndexTensors(scope); + + // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0] + PADDLE_ENFORCE(!Outputs("Outs").empty(), + "Outputs(Outs) of CondOp can't be empty."); + for (auto& output : Outputs("Outs")) { + const LoDTensor* tensor_t_out = + &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL"); + const LoDTensor* tensor_f_out = + &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get(); + PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL"); + + auto* var_out = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found"); + LoDTensor* tensor_out = var_out->GetMutable(); + PADDLE_ENFORCE_NOT_NULL(tensor_t_out, + "True output tensor should not be NULL"); + + DDim true_dim = tensor_t_out->dims(); + DDim false_dim = tensor_f_out->dims(); + true_dim[0] = 0; + false_dim[0] = 0; + PADDLE_ENFORCE_EQ(true_dim, false_dim, + "Outputs not of the same shape except the first dim"); + + DDim out_dim = tensor_t_out->dims(); + out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0]; + tensor_out->Resize(out_dim); + tensor_out->mutable_data(platform::CPUPlace()); + } + + // merge output results: + // output_tensor = true_output_tensor + false_output_tensor + for (auto& output : Outputs("Outs")) { + Variable* var_parent = scope.FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_parent); + auto* tensor_parent = var_parent->GetMutable(); + + for (int i = 0; i < BRANCH_NUM; ++i) { + Variable* var_child = sub_scopes[i]->FindVar(output); + PADDLE_ENFORCE_NOT_NULL(var_child); + auto* tensor_child = &var_child->Get(); + ScatterAssign(dev_ctx, *tensor_child, index_tensors[i], + tensor_parent); + } + } +} + +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); + + PrepareDataForSubnet(scope, dev_ctx); + std::vector& sub_scopes = GetSubScopes(scope); + for (int i = 0; i < BRANCH_NUM; ++i) { + sub_net_op_[i]->Run(*sub_scopes[i], place); + } + MergeDataFromSubnet(scope, dev_ctx); +} + +class 
CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { + public: + CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Cond", "The condition, which is a bool vector"); + AddInput("Xs", "Inputs of Subnets").AsDuplicable(); + AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable(); + + AddOutput("SubScopes", "sub scopes for true and false branches"); + AddOutput("IndexTensors", "Index Tensors contains indices for true/false"); + + AddComment(R"DOC( +Sample Dependent Conditional Operator. + +Given Cond[i] as a 1/0 vector to indicate true/false: +Out[i] = subnet_true[i], if Cond[i] == true +Out[i] = subnet_false[i], if Cond[i] == false + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp, + paddle::operators::CondOpProtoAndCheckerMaker); diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h new file mode 100644 index 0000000000000000000000000000000000000000..695af4490696b29d2d47f5825ebc0159b39663c0 --- /dev/null +++ b/paddle/fluid/operators/cond_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +/* + * @brief CondOp is a dynamic if-else Operator + * + * It has a input tensor named cond indicating which netop each instance will + * run. + * + * if cond == 1, it will run true_net, which is a NetOp. + * + * if cond == 0, it will run false_net, which is another NetOp. + */ +class CondOp : public framework::OperatorBase { + public: + CondOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + sub_net_op_.resize(BRANCH_NUM); + } + + CondOp(const CondOp& o) + : framework::OperatorBase( + static_cast(o)) { + // TODO(yuyang18): Implement copy ctor well. 
+ PADDLE_THROW("Not implemented"); + } + + framework::Scope& AddSubScope(const framework::Scope& scope) const; + std::vector& GetSubScopes( + const framework::Scope& scope) const; + + framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const; + std::vector& GetIndexTensors( + const framework::Scope& scope) const; + + void PrepareDataForSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + void MergeDataFromSubnet(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + /* + * Set True Block + */ + void set_truenet(std::unique_ptr&& net) { + sub_net_op_[TRUE_BRANCH] = std::move(net); + } + + /* + * Set False Block + */ + void set_falsenet(std::unique_ptr&& net) { + sub_net_op_[FALSE_BRANCH] = std::move(net); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override; + + private: + const int TRUE_BRANCH = 0; + const int FALSE_BRANCH = 1; + const int BRANCH_NUM = 2; + + // sub_net_op_[0]: subnet_t + // sub_net_op_[1]: subnet_f + std::vector> sub_net_op_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30435c6cca0a4fb1d41dce47b8fefeafb6c48a51 --- /dev/null +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ConditionalOp : public framework::OperatorBase { + public: + ConditionalOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + std::vector InputTensors( + const framework::Scope &scope) const { + std::vector retv; + auto xs = Inputs("X"); + retv.resize(xs.size(), nullptr); + std::transform( + xs.begin(), xs.end(), retv.begin(), + [&scope](const std::string &var_name) -> const framework::LoDTensor * { + auto *var = scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name); + return &var->Get(); + }); + return retv; + } + + bool ScalarCondition( + const std::vector &ips) const { + if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { + PADDLE_THROW("should have one initialized input as condition"); + } + if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && + ips[0]->numel() == 1)) { + PADDLE_THROW( + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); + } + return ips[0]->data()[0]; + } +}; + +class ConditionalBlockOp : public ConditionalOp { + public: + ConditionalBlockOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Output("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto *scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + auto &cur_scope = *scopes->front(); + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + } + } +}; + +class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The conditional variable of this operator. If X is empty, the " + "whole sub-block will not be executed.") + .AsDuplicable(); + AddInput("Params", "The input variables of the sub-block.").AsDuplicable(); + AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); + AddOutput("Scope", + "(std::vector) The step scope of conditional block. To " + "unify the conditional block, rnn and while op, the type of " + "scope is std::vector"); + AddAttr( + "sub_block", "The step block of conditional block operator"); + AddAttr("is_scalar_condition", + "the input X is used as scalar " + "condition") + .SetDefault(false); + AddComment(R"DOC(Conditional block operator + +Run the sub-block if X is not empty. Params is the other inputs and Out is the +outputs of the sub-block. 
+)DOC"); + } +}; + +class ConditionalBlockGradOp : public ConditionalOp { + public: + ConditionalBlockGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto xs = this->InputTensors(scope); + + bool need_run; + if (Attr("is_scalar_condition")) { + need_run = ScalarCondition(xs); + } else { + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Input("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto &scopes = scope_var->Get>(); + framework::Scope &cur_scope = *scopes[0]; + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), + Outputs(framework::GradVarName("Params"))); + + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), + Outputs(framework::GradVarName("X"))); + } + } + + private: + void AssignLocalGradientToGlobal( + const platform::Place &place, const framework::Scope &cur_scope, + const std::vector &p_names, + const std::vector &pg_names) const { + for (size_t i = 0; i < p_names.size(); ++i) { + auto out_grad_name = pg_names[i]; + auto in_grad_name = framework::GradVarName(p_names[i]); + auto *in_var = cur_scope.FindVar(in_grad_name); + if (in_var == nullptr) { + continue; + } + auto new_in_grad_name = cur_scope.Rename(in_grad_name); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); + assign->Run(cur_scope, place); + cur_scope.Rename(new_in_grad_name, in_grad_name); + } + } +}; + +class ConditionalBlockGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("X")); + if (context->HasInputs("Params")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params"))); + context->SetOutputsDim(framework::GradVarName("Params"), + context->GetInputsDim("Params")); + } + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X"))); + context->SetOutputsDim(framework::GradVarName("X"), + context->GetInputsDim("X")); + } +}; + +class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); + grad_op->SetType("conditional_block_grad"); + grad_op->SetInput("X", Input("X")); + grad_op->SetInput("Params", Input("Params")); + grad_op->SetInput("Out", Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetInput("Scope", Output("Scope")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); + grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); + grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; 
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, + ops::ConditionalBlockOpProtoMaker, + ops::ConditionalBlockGradMaker); +REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, + ops::ConditionalBlockGradInferShape); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a729d376ac8c3dc49ec06271c3ffef6406a20b28 --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; + +template +class CUDNNConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. 
+ PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_channels = filter->dims()[0]; + int output_height, output_width, output_depth; + if (output->dims().size() == 5) { + output_depth = output->dims()[2]; + output_height = output->dims()[3]; + output_width = output->dims()[4]; + } else { + output_depth = 1; + output_height = output->dims()[2]; + output_width = output->dims()[3]; + } + + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = + output_channels / groups * output_height * output_width * output_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + // get workspace size able to allocate + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv forward --------------------- + T alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = 
ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_grad_desc; + + ScopedFilterDescriptor filter_desc; + ScopedFilterDescriptor filter_grad_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it mannually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + cudnnTensorDescriptor_t cudnn_output_grad_desc = + output_grad_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims()), groups); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + int input_channels = input->dims()[1]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + + int output_grad_channels = filter->dims()[0]; + int output_grad_height, output_grad_width, output_grad_depth; + if (input->dims().size() == 5) { + output_grad_depth = output_grad->dims()[2]; + output_grad_height = output_grad->dims()[3]; + output_grad_width = output_grad->dims()[4]; + } else { + output_grad_depth = 1; + output_grad_height = output_grad->dims()[2]; + output_grad_width = output_grad->dims()[3]; + } + + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = output_grad_channels / groups * output_grad_height * + output_grad_width * output_grad_depth; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t workspace_size_in_bytes = 0, tmp_size = 0; + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, + // dyDesc: Handle to the previously initialized input differential + // tensor descriptor. 
+ cudnn_output_grad_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + + if (filter_grad) { + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + + for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in)); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. 
+ for (int i = 0; i < groups; i++) { + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_output_grad_desc, output_grad_data + i * group_offset_out, + cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, + filter_grad_data + i * group_offset_filter)); + } + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a047e579163cfe9cd0d053f337b7a92339466a96 --- /dev/null +++ b/paddle/fluid/operators/conv_op.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_op.h" + +namespace paddle { +namespace operators { + +void ConvOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "Conv intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + "Conv input dimension and filter dimension should be the same."); + PADDLE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "Conv input dimension and strides dimension should be consistent."); + PADDLE_ENFORCE_EQ( + paddings.size(), strides.size(), + "Conv paddings dimension and Conv strides dimension should be the same."); + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + "The number of output channels should be divided by groups."); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] - + (dilations[i] * (filter_dims[i + 2] - 1) + 1) > + 0, + "Due to the settings of paddings, filter_dims and " + "dilations, the output size is less than 0, please check " + "again."); + output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); +} + +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H is the height of the filter, and W is the width of the filter. 
" + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0}), the " + "paddings(h_pad, w_pad) of " + "convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and Output(Output) are in NCHW format. Where N is batch +size, C is the number of channels, H is the height of the feature, and W is +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ +)DOC"); +} + +Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCDHW. 
Where N is batch size, C is the " + "number of channels, D is the depth of the feature, H is the height of " + "the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCDHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "D is the depth of the filter, H is the height of the filter, and W " + "is the width of the filter." + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator." + "The format of output tensor is also NCDHW."); + AddAttr>("strides", + "(vector, default:{1, 1, 1}), the " + "strides(d_stride, h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector, default:{0, 0, 0}), the " + "paddings(d_pad, h_pad, w_pad) of convolution " + "operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation, h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + + AddComment(R"DOC( +Convolution3D Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format, where N is batch +size, C is the number of channels,D is the depth of the feature, H is the height of +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. +The input(X) size and output(Out) size may be different. 
+ +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ +)DOC"); +} + +void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, + ops::ConvOpGrad); + +// depthwise convolution op +REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + depthwise_conv2d_grad, ops::ConvOpGrad); +REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, + ops::ConvOpGrad); + +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2129d3b461249b5d1b317edde924ffc04f4f90f --- /dev/null +++ b/paddle/fluid/operators/conv_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1156e6c8fe3263607d4dcd1af0c9996acd9368fb --- /dev/null +++ b/paddle/fluid/operators/conv_op.h @@ -0,0 +1,422 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// Base convolution operator definations for other conv +// like operators to reuse the implementation. +inline int OutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + const int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + return output_size; +} +inline bool IsExpand(std::vector& filter_dim, + std::vector& strides, std::vector& paddings, + std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +// Define Op classes in .h file so that other conv +// operator implementations can reuse the code. 
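+// Illustrative sketch of OutputSize and IsExpand above (hypothetical values):
+//   OutputSize(32, 3, /*dilation=*/1, /*padding=*/1, /*stride=*/1)
+//     = (32 + 2 * 1 - (1 * (3 - 1) + 1)) / 1 + 1 = 32
+//   OutputSize(32, 3, 1, 1, 2) = 31 / 2 + 1 = 16
+// IsExpand is false only for 1x1 filters with unit stride and dilation and
+// zero padding, i.e. the case where im2col/vol2col would be an identity copy.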
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * + // o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
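+    // Illustrative shape walk-through (hypothetical sizes): for an
+    // (N, 8, 32, 32) input, a (16, 8, 3, 3) filter, groups = 1, unit strides
+    // and dilations and padding 1, col_shape is {8, 3, 3, 32, 32} and
+    // col_matrix is (8 * 3 * 3, 32 * 32) = (72, 1024); the filter is reshaped
+    // to (16, 72), so each gemm below yields a (16, 1024) output slice.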
+ Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + auto& dev_ctx = context.template device_context(); + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); + } + } + } +}; + +template +class GemmConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. 
+ Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output_grad->dims()[1], + output_grad->numel() / + (output_grad->dims()[0] * output_grad->dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output_grad->dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape, context.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. 
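+      // Sketch of the backward-data gemm below (same hypothetical shapes as
+      // the forward walk-through): col_matrix = filter_slice^T (72, 16) x
+      // out_grad_slice (16, 1024) -> (72, 1024), which col2im/col2vol then
+      // scatters back into the (8, 32, 32) input-gradient slice.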
+ if (is_expand) { + set_zero(dev_ctx, input_grad, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + Tensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); + } + } + } + } +}; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::DepthwiseConvFunctor depthwiseConv; + + auto& dev_ctx = context.template device_context(); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + 
context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, input_grad); + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, + filter_grad); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a96aac63e09b47c9afe99b2e622a718839ba047c --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +class ConvShiftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The 1st dimension of Input(X) and Input(Y) should " + "be equal."); + PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, + "The 2nd dimension of Input(Y) should be odd."); + PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], + "The 2nd dimension of Input(Y) should be less than or " + "equal to the 2nd dimension of Input(X)."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConvShiftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "where B is the batch size and M is the data dimension."); + AddInput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape B x N, " + "where B is the batch size and N is the data dimension. N must " + "be odd."); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "i.e., the same shape as X."); + AddComment(R"DOC( +ConvShift Operator. + +A layer for circular convolution of two vectors, +as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 + +The equation is: + +$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ + +where X's index is computed modulo M, and Y's index is computed modulo N. + +Both inputs X and Y can carry LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
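+
+As a small illustrative instance of the equation above, take M = 4 and N = 3,
+so the window is $j \in \{-1, 0, 1\}$:
+
+$$Out[0] = X_{-1} * Y_{-1} + X_{0} * Y_{0} + X_{1} * Y_{1} = X_{3} * Y_{2} + X_{0} * Y_{0} + X_{1} * Y_{1}$$
+
+where the index $-1$ wraps around modulo M for X and modulo N for Y.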
+ +)DOC"); + } +}; + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto out = EigenMatrix::From(*Out); + out.setZero(); + + size_t batch_size = X->dims()[0]; + size_t x_width = X->dims()[1]; + size_t y_width = Y->dims()[1]; + size_t y_half_width = (y_width - 1) / 2; + + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + out(k, i) += x(k, index) * y(k, j); + } + } + } + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *dOut = context.Input(framework::GradVarName("Out")); + auto *dX = context.Output(framework::GradVarName("X")); + auto *dY = context.Output(framework::GradVarName("Y")); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto dout = EigenMatrix::From(*dOut); + + auto x_dims = X->dims(); + auto y_dims = Y->dims(); + size_t batch_size = x_dims[0]; + size_t x_width = x_dims[1]; + size_t y_width = y_dims[1]; + size_t y_half_width = (y_width - 1) / 2; + + // The below trades code duplication for efficiency (keeping the if + // statement outside of the loop). + if (dX) { + dX->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*dX); + dx.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dx(k, index) += dout(k, i) * y(k, j); + } + } + } + } + + if (dY) { + dY->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*dY); + dy.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dy(k, j) += x(k, index) * dout(k, i); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, + conv_shift_grad, ops::ConvShiftGradOp); +REGISTER_OP_CPU_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CPU_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9818707ce3b98afe25050336d85e3b05919620f3 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Some notes on the design: +// +// Each thread is responsible for computing a single output out[k, i]. +// Thread blocks are based on tiles of x with height 1 in the batch dimension. +// +// This design is based on the typical use case where the filter +// y is fairly small. For large y, it would probably be more efficient +// to also tile across y. +template +__global__ void ConvShiftForward(const T *x, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *out) { + extern __shared__ T mem[]; + + int tx = threadIdx.x; + int i = blockIdx.x * blockDim.x + tx; // global x index + int k = blockIdx.y; // batch index + + // Check if we are in a boundary block with fewer x's to process than + // blockDim.x. + int num_x = + (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x; + + T *sx = mem; + T *sx_pad = &mem[num_x]; + T *sy = &mem[blockDim.x + y_width]; + + // Collaboratively load y[k, :] and length-y padding of x into shared memory. + int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width; + for (int j = tx; j < y_width; j += blockDim.x) { + sy[j] = y[k * y_width + j]; + sx_pad[j] = x[k * x_width + (pad_start + j) % x_width]; + } + + // Load a cyclically shifted slice of x into shared memory. + if (tx < num_x) { + int load_i = (i - y_half_width + x_width) % x_width; + sx[tx] = x[k * x_width + load_i]; + } + __syncthreads(); + + if (tx < num_x) { + // Compute dot product of sx[tx:tx + y_width] and sy. + T sum = 0; + for (int j = 0; j < y_width; ++j) { + sum += sx[tx + j] * sy[j]; + } + + // Save to out[k, i]. + out[k * x_width + i] = sum; + } +} + +// Compute x gradient - initial naive implementation with atomic add. +template +__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *dx) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dx[k * x_width + index], + dout[k * x_width + i] * y[k * y_width + j]); + } +} + +// Compute y gradient - initial naive implementation with atomic add. 
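+// Each thread below handles one (x index i, filter index j, batch k) triple
+// drawn from a (num_x_blocks, y_width, batch_size) grid; every i contributes
+// to the same dy[k, j], so the accumulation has to be atomic.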
+template +__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, + int y_half_width, int batch_size, T *dy) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dy[k * y_width + j], + x[k * x_width + index] * dout[k * x_width + i]); + } +} +} // namespace + +template +class ConvShiftKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + Tensor *Out = context.Output("Out"); + const T *x_data = X->data(); + const T *y_data = Y->data(); + T *out_data = Out->mutable_data(context.GetPlace()); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); + + dim3 grid_dim(num_x_blocks, batch_size); + + auto stream = + context.template device_context().stream(); + + ConvShiftForward<<>>( + x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + const T *x_data = X->data(); + const T *y_data = Y->data(); + const T *dout_data = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dY = context.Output(framework::GradVarName("Y")); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + dim3 grid_dim(num_x_blocks, y_width, batch_size); + + if (dX) { + T *dx_data = dX->mutable_data(context.GetPlace()); + zero(device_ctx, dX, static_cast(0.0)); + ConvShiftGradX<<>>( + dout_data, y_data, x_width, y_width, y_half_width, batch_size, + dx_data); + } + if (dY) { + T *dy_data = dY->mutable_data(context.GetPlace()); + zero(device_ctx, dY, static_cast(0.0)); + ConvShiftDy<<>>( + x_data, dout_data, x_width, y_width, y_half_width, batch_size, + dy_data); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h new file mode 100644 index 0000000000000000000000000000000000000000..987a690895e2a7428f058eb2d8366f9c7572912b --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class ConvShiftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0aed4ebeffa7312c218bb892fbcdf9cd9cdc53ca --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -0,0 +1,251 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; + +template +class CUDNNConvTransposeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + // (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn conv workspace --------------------- + void* cudnn_workspace = nullptr; + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + // Get the algorithm + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. 
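+        // For a transposed convolution the forward pass reuses cuDNN's
+        // backward-data path, so the op's Output descriptor takes the dxDesc
+        // role just below.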
+ cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + + // get workspace size able to allocate + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + + // Allocate on GPU memory + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + + // ------------------- cudnn conv transpose forward --------------------- + T alpha = 1.0f, beta = 0.0f; + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc, + input_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +template +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + // Input: (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims())); + // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionFwdAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t bwd_filter_ws_size, fwd_ws_size; + size_t workspace_size_in_bytes = 0; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + // choose backward algorithm for data + PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + 
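+      // The gradient w.r.t. Input of a transposed convolution is an ordinary
+      // forward convolution of Output@GRAD with the filter, hence the forward
+      // algorithm above and the forward workspace-size query below.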
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, data_algo, &fwd_ws_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); + } + + if (filter_grad) { + // choose backward algorithm for filter + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + // get workspace for backwards filter algorithm + PADDLE_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); + workspace_size_in_bytes = + std::max(workspace_size_in_bytes, bwd_filter_ws_size); + } + + // ------------------- cudnn conv workspace --------------------- + // Already on GPU + void* cudnn_workspace = nullptr; + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + T alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, output_grad_data, + cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data)); + } + + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. + // Gradient with respect to the filter + PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, + input_data, cudnn_conv_desc, filter_algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data)); + } + // Release the cudnn workspace + paddle::memory::Free(gpu, cudnn_workspace); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974cffad92871c1a855c86a7a2e56f8e65819428 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -0,0 +1,323 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace paddle { +namespace operators { + +void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvTransposeOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], + "In ConvTransposeOp, The input channel should be the same " + "as the number of filters."); + + std::vector output_shape({in_dims[0], filter_dims[1]}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + + filter_extent); + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); +} + +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of convolution transpose operator. " + "The format of input tensor is NCHW. 
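[Editor's note] The InferShape above grows each spatial dimension as (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1, which also matches the $$H_{out}$$ / $$W_{out}$$ formulas quoted in the op comment further down in this file. A tiny standalone check of that arithmetic (the helper name is illustrative, not part of the operator):

#include <cassert>

// Output extent of a transposed convolution along one spatial dimension,
// mirroring the InferShape loop above.
int ConvTransposeOutSize(int in, int kernel, int stride, int pad, int dilation) {
  int filter_extent = dilation * (kernel - 1) + 1;
  return (in - 1) * stride - 2 * pad + filter_extent;
}

int main() {
  // e.g. a 7x7 feature map, 3x3 kernel, stride 2, pad 1, dilation 1 -> 13x13 output.
  assert(ConvTransposeOutSize(7, 3, 2, 1, 1) == 13);
  return 0;
}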
Where N is batch size, C is the " + "number of input channels, H is the height of the feature, and " + "W is the width of the feature."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator. " + "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "strides", + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " + "convolution transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "transpose operator.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution2D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 + $$ +)DOC"); +} + +Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) The input tensor of convolution transpose operator." + "The format of input tensor is NCDHW. 
Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and " + "W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution transpose operator." + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " + "is the depth of the filter, H is the height of the filter, and " + "W is the width of the filter." + "We enforce groups number == 1 and padding == 0 in " + "the convolution3d transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator." + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("strides", + "(vector default:{1, 1, 1}), the " + "strides{d_stride, h_stride, w_stride} of " + "convolution transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0}), paddings(d_pad, " + "h_pad, w_pad) of convolution transpose operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(4096); + AddComment(R"DOC( +Convolution3D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter,H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. +The input(X) size and output(Out) size may be different. 
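[Editor's note] GetExpectedKernelType earlier in this file enables the cuDNN kernel only when the use_cudnn attribute is set, the place is a GPU place, and (under PADDLE_WITH_CUDA) the device context holds a cuDNN handle; otherwise it falls back to the plain library. A minimal stand-in for that decision, with the three runtime checks reduced to hypothetical booleans:

enum class Library { kPlain, kCUDNN };

// Illustrative stand-in for the dispatch above; the parameters are placeholders for
// Attr<bool>("use_cudnn"), platform::is_gpu_place(...) and a non-null cuDNN handle.
Library ChooseLibrary(bool use_cudnn_attr, bool on_gpu, bool has_cudnn_handle) {
  bool use_cudnn = use_cudnn_attr && on_gpu && has_cudnn_handle;
  return use_cudnn ? Library::kCUDNN : Library::kPlain;
}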
+ +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + $$ +)DOC"); +} + +void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout_, library_); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, + conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, + conv3d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed90c6ec6265cc914a172b6c7217a204981e7fd1 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f512575468626edfb3e36c007e26b05faff0a06d --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +// Define Op classes in .h file so that other conv transpose +// operator implementations can reuse the code. +class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class ConvTransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvTransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped, so it should not be constant pointer + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + // groups will alway be disabled in conv2dtranspose. 
+ + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); + + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + + // convolution transpose: gemm + col2im or col2vol (similar to conv-backward + // on input) + for (int i = 0; i < batch_size; i++) { + // batch with size (m, h * w) or (m, d * h * w) + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + // col_matrix = filter * input_batch + // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); + + if (data_dim == 2U) { + // col2im: col_matrix -> dy + // from (c * k_h * k_w, h * w) to (c, o_h, o_w) + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &output_batch); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> dy + // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); + } + } + } +}; + +template +class GemmConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + // For filter, we do not use const pointer b/c we will do reshape, + // but we should avoid modifying its value. 
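[Editor's note] In the forward kernel above, col owns the column buffer while col_matrix shares its storage (ShareDataWith) and is merely Resize'd to the 2-D col_matrix_shape, so the gemm can treat it as a (C·k_h·k_w) x (H·W) matrix; flatten_to_2d collapses the leading and trailing axes at the split point. A framework-free sketch of that flattening arithmetic, as I read the call above:

#include <cassert>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Collapse an N-D shape into a 2-D (rows, cols) pair at a split axis,
// mirroring framework::flatten_to_2d(col_shape, data_dim + 1) above.
std::pair<long, long> FlattenTo2D(const std::vector<long>& dims, size_t axis) {
  long rows = std::accumulate(dims.begin(), dims.begin() + axis, 1L,
                              std::multiplies<long>());
  long cols = std::accumulate(dims.begin() + axis, dims.end(), 1L,
                              std::multiplies<long>());
  return {rows, cols};
}

int main() {
  // col_shape {c, k_h, k_w, h, w} with data_dim = 2 splits at axis data_dim + 1 = 3:
  auto mat = FlattenTo2D({8, 3, 3, 16, 16}, 3);  // -> (8*3*3, 16*16) = (72, 256)
  assert(mat.first == 72 && mat.second == 256);
  return 0;
}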
+ Tensor filter = *context.Input("Filter"); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + + if ((!input_grad) && (!filter_grad)) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output_grad->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = framework::slice_ddim(output_grad->dims(), 1, + output_grad->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + // convolution transpose grad on input: + // im2col + gemm (similar to conv-forward) + // input need to compute gradient + auto& dev_ctx = context.template device_context(); + if (input_grad || filter_grad) { + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
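[Editor's note] With the filter flattened to filter_matrix_shape = (M, C·k_h·k_w) above, both gradients in the batch loop that follows reduce to plain matrix products against the column buffer built from Output@GRAD. In shape terms (a summary of the loop's own comments, not additional code):

  dX_batch : (M, H·W)        =  W (M, C·k_h·k_w) x col (C·k_h·k_w, H·W)
  dW       : (M, C·k_h·k_w) +=  X_batch (M, H·W) x col^T (H·W, C·k_h·k_w)   (beta = 1 accumulates over the batch)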
+ Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + Tensor filter_grad_; + math::SetConstant set_zero; + + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + } + if (filter_grad) { // filter size (m, c, k_h, k_w) + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + } + + for (int i = 0; i < batch_size; i++) { + // batch with size (c, o_h * o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + + if (data_dim == 2U) { + // im2col: dy -> col matrix + // from (c, o_h, o_w) to (c * k_h * k_w, h * w) + im2col(dev_ctx, output_grad_batch, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col: dy -> col_matrix + // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); + } + + if (input_grad) { + // batch with size (m, h, w) + Tensor input_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: dx = filter * dy + // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w) + // or + // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, + // d, h, w) + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); + } + if (filter_grad) { + // input batch + Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: d_filter = x * dy^T + // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w) + // or + // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * + // k_h * k_w) + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..57c5a6025a03fbafadb56a3dbec9c4cfab5e979a --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("XNorm"), + "Output(XNorm) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("YNorm"), + "Output(YNorm) of CosSimOp should not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + + // resize tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->SetOutputDim("XNorm", {x_dims[0], 1}); + ctx->SetOutputDim("YNorm", {y_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The 1st input of cos_sim op."); + AddInput("Y", "The 2nd input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + AddOutput("XNorm", + "Norm of the first input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + AddOutput("YNorm", + "Norm of the second input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + + AddComment(R"DOC( +Cosine Similarity Operator. + +$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$ + +The input X and Y must have the same shape, except that the 1st dimension +of input Y could be just 1 (different from input X), which will be +broadcasted to match the shape of input X before computing their cosine +similarity. + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. 
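[Editor's note] The comment above defines the output row-wise as X·Y divided by the product of the two norms, with a single-row Y broadcast against every row of X. A framework-free sketch of the per-row computation (the two sqrt terms correspond to XNorm and YNorm in the operator; zero-length inputs are not handled here):

#include <cassert>
#include <cmath>
#include <vector>

// Per-row cosine similarity matching the formula in the op comment above.
double CosSim(const std::vector<double>& x, const std::vector<double>& y) {
  double xy = 0.0, xx = 0.0, yy = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    xy += x[i] * y[i];
    xx += x[i] * x[i];
    yy += y[i] * y[i];
  }
  return xy / (std::sqrt(xx) * std::sqrt(yy));
}

int main() {
  // Parallel vectors have similarity 1.
  assert(std::abs(CosSim({1, 2, 3}, {2, 4, 6}) - 1.0) < 1e-12);
  return 0;
}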
+ +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto xnorm_dims = ctx->GetInputDim("XNorm"); + auto ynorm_dims = ctx->GetInputDim("YNorm"); + auto out_dims = ctx->GetInputDim("Out"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1}); + auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1}); + PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims, + "Shape of Input(XNorm) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims, + "Shape of Input(YNorm) must be [Y.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims, + "Shape of Input(Out) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims, + "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); + + // resize tensor + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, + ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c8cf363cdc4009bd8fa233a52435dcc6ea56cf3c --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9cd8b196daf6e4afe9bde4d91db0110430cd7324 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* out_z = context.Output("Out"); + auto* out_x_norm = context.Output("XNorm"); + auto* out_y_norm = context.Output("YNorm"); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } else { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* in_z = context.Input("Out"); + auto* in_x_norm = context.Input("XNorm"); + auto* in_y_norm = context.Input("YNorm"); + auto* out_grad_x = context.Output(framework::GradVarName("X")); + auto* out_grad_y = context.Output(framework::GradVarName("Y")); + auto* in_grad_z = context.Input(framework::GradVarName("Out")); + + // compute gradident + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + if (out_grad_x) { + math::CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + 
out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + math::CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + } else { + if (out_grad_x) { + math::CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + out_grad_y->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + math::CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1ba51f2c0f13a1b6e4d7ccb93c912703a0b1d86 --- /dev/null +++ b/paddle/fluid/operators/create_reader_op.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
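[Editor's note] For reference, the gradient the CosSimGrad functors above are presumably evaluating follows from differentiating the forward formula; with $a = \lVert X \rVert$ and $b = \lVert Y \rVert$:

$$
\frac{\partial\, Out}{\partial X} = \frac{Y}{a\,b} - Out \cdot \frac{X}{a^{2}},
$$

and symmetrically for Y, each scaled by the incoming Out@GRAD. In the broadcast case (Y has a single row) the contributions from every row of X are summed into Y@GRAD, which is presumably why the kernel zero-fills it with SetConstant before calling CosSimDyFunctor.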
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { + +static std::vector RestoreShapes( + const std::vector& shape_concat, const std::vector& ranks) { + std::vector res; + int offset = 0; + for (int len : ranks) { + auto start_it = shape_concat.begin() + offset; + auto end_it = start_it + len; + res.push_back(framework::make_ddim(std::vector(start_it, end_it))); + offset += len; + } + return res; +} + +// general infershape for file readers +class CreateFileReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output file reader should not be null."); + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + if (ctx->IsRuntime()) { + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ( + lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } + } +}; + +// general infershape for decorated readers +class CreateDecoratedReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), + "Input(UnderlyingReader) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output decorated reader should not be null."); + ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); + + if (ctx->IsRuntime()) { + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); + } + } +}; + +// general var type inference for file readers +class CreateFileReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + reader->SetType(framework::proto::VarDesc::READER); + } +}; + +// general var type inference for decorated readers +class CreateDecoratedReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; + framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); + std::string out_reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); + out_reader->SetType(framework::proto::VarDesc::READER); + out_reader->SetDataTypes(in_reader->GetDataTypes()); + } +}; + +template +class CreateRandomDataGeneratorOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && 
!ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::vector shapes = RestoreShapes(shape_concat, ranks); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), + Attr("max"))); + } +}; + +class CreateRandomDataGeneratorOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddOutput("Out", "(ReaderHolder) The created random reader."); + AddAttr>("shape_concat", + "The concat of all data's shapes."); + AddAttr>( + "ranks", + "The ranks of each data." + "e.g." + "shape_concat = [2,3,4,5,6]" + "ranks = [3,2]" + "It means the reader will generate two data each time," + "whose shapes are [2,3,4] and [5,6] respectively."); + AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr("min", "The lower bound of reader's uniform distribution."); + AddAttr("max", "The upper bound of reader's uniform distribution."); + AddComment(R"DOC( + CreateRandomDataGenerator Operator + + This Op creates a random reader. + The reader generates random data instead of really reading from files. + Generated data follow an uniform distribution between 'min' and 'max'. + )DOC"); + } +}; + +class CreateShuffleReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::ShuffleReader(underlying_reader.Get(), + Attr("buffer_size"))); + } +}; + +class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a shuffle reader."); + AddOutput("Out", "(ReaderHolder) The created shuffle reader."); + AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); + AddComment(R"DOC( + CreateShuffleReader Operator + + A shuffle reader takes another reader as its 'underlying reader' + and yields the underlying reader's outputs in a shuffled order. 
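[Editor's note] RestoreShapes above turns the flat shape_concat attribute back into one shape per output, sliced by ranks, exactly as the CreateRandomDataGenerator doc string illustrates (shape_concat = [2,3,4,5,6] with ranks = [3,2] gives [2,3,4] and [5,6]). A framework-free sketch of the same slicing, with hypothetical names and plain vectors in place of framework::make_ddim:

#include <cassert>
#include <vector>

// Slice shape_concat into one shape per entry of ranks, as RestoreShapes above does.
std::vector<std::vector<int>> SplitShapes(const std::vector<int>& shape_concat,
                                          const std::vector<int>& ranks) {
  std::vector<std::vector<int>> res;
  int offset = 0;
  for (int len : ranks) {
    res.emplace_back(shape_concat.begin() + offset,
                     shape_concat.begin() + offset + len);
    offset += len;
  }
  return res;
}

int main() {
  auto shapes = SplitShapes({2, 3, 4, 5, 6}, {3, 2});
  assert(shapes.size() == 2 && shapes[0].size() == 3 &&
         shapes[1] == std::vector<int>({5, 6}));
  return 0;
}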
+ )DOC"); + } +}; + +class CreateBatchReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new framework::BatchReader(underlying_reader.Get(), + Attr("batch_size"))); + } +}; + +class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput( + "UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a batch reader."); + AddOutput("Out", "(ReaderHolder) The created batch reader."); + AddAttr("batch_size", + "How many instances the batch reader yields each time.") + .GreaterThan(0); + AddComment(R"DOC( + CreateBatchReader Operator + + A batch reader takes another reader as its 'underlying reader', + gathers the underlying reader's outputs and then yields them in batches. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(create_random_data_generator, + ops::CreateRandomDataGeneratorOp, + ops::CreateFileReaderInferShape, + ops::CreateRandomDataGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateFileReaderInferVarType); +REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateShuffleReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); +REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, + ops::CreateDecoratedReaderInferShape, + ops::CreateBatchReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::CreateDecoratedReaderInferVarType); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3c1fc95a3be574635ab8a99aa29b71bd8dbc71e --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crf_decoding_op.h" + +namespace paddle { +namespace operators { +class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Emission", + "(LoDTensor, default: LoDTensor). A LoDTensor with shape " + "[N x D] where N is the size of the mini-batch and D is the total " + "tag number. This input is the unscaled emission weight matrix of " + "the linear_chain_crf operator."); + AddInput( + "Transition", + "(Tensor, default: Tensor). A Tensor with shape [(D + 2) x D]. 
" + "This input is the transition weights learned by the linear_chain_crf " + "operator, denoted as w. The 1st row of w are transition weights for " + "the start mask. The 2nd row of w are transition weights for the end " + "mask. Transition weights between other tags begin from the 3rd row of " + "w. See more details in comments of the linear_chain_crf operator."); + AddInput( + "Label", + "(LoDTensor, LoDTensor). The ground truth with shape " + "[N x 1]. This input is optional. See more details in the operator's " + "comments.") + .AsDispensable(); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. See more details in the operator's comment."); + AddComment(R"DOC( +The crf_decoding operator reads the emission feature weights and the transition +feature weights learned by the linear_chain_crf operator. It implements the +Viterbi algorithm which is a dynamic programming algorithm for finding the most +likely sequence of hidden states, called the Viterbi path, that results in a +sequence of observed tags. + +The output of this operator changes according to whether Input(Label) is given: + +1. Input(Label) is given: + +This happens in training. This operator is used to co-work with the chunk_eval +operator. + +When Input(Label) is given, the crf_decoding operator returns a row vector +with shape [N x 1] whose values are fixed to be 0, indicating an incorrect +prediction, or 1 indicating a tag is correctly predicted. Such an output is the +input to chunk_eval operator. + +2. Input(Label) is not given: + +This is the standard decoding process. + +The crf_decoding operator returns a row vector with shape [N x 1] whose values +range from 0 to maximum tag number - 1. Each element indicates an index of a +predicted tag. 
+)DOC"); + } +}; + +class CRFDecodingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"), + "Output(ViterbiPath) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + if (ctx->HasInput("Label")) { + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + } + + ctx->ShareLoD("Emission", /*->*/ "ViterbiPath"); + ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Emission")->type()), + platform::CPUPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, + ops::CRFDecodingOpMaker); +REGISTER_OP_CPU_KERNEL( + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c3c161eec5f541b9f60a36064d7e8c350078c664 --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; + +template +class CRFDecodingOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* emission_weights = ctx.Input("Emission"); + auto* transition_weights = ctx.Input("Transition"); + auto* label = ctx.Input("Label"); + auto* decoded_path = ctx.Output("ViterbiPath"); + + PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + auto lod = emission_weights->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence."); + const size_t level = 0; + const size_t seq_num = lod[level].size() - 1; + + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, + &decoded_path_one_seq); + } + + if (label) { + PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + const int64_t* label_value = label->data(); + size_t batch_size = emission_weights->dims()[0]; + for (size_t i = 0; i < batch_size; ++i) { + path[i] = label_value[i] == path[i] ? 1 : 0; + } + } + } + + private: + void Decode(const Tensor& emission_weights, const Tensor& transition_weights, + Tensor* decoded_path) const { + auto emission_dims = emission_weights.dims(); + const size_t seq_len = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + const size_t state_trans_base_idx = 2; + + const T* x = emission_weights.data(); + const T* w = transition_weights.data(); + int64_t* path = decoded_path->data(); + + // alpha is a memo table. An element alpha(k, v) records the score of the + // best sequence of tags from position 1 to position k with v being the end + // tag. 
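[Editor's note] The Decode routine that follows fills alpha with the standard Viterbi recurrence; as the crf_decoding op comment earlier notes, the first two rows of w hold the start and end transition weights (hence state_trans_base_idx = 2). A hedged summary of that recurrence, not taken verbatim from the code:

$$
\alpha(0, i) = w_{\text{start}}(i) + x(0, i), \qquad
\alpha(k, i) = \max_{j}\bigl[\alpha(k-1, j) + w(j \rightarrow i)\bigr] + x(k, i),
$$

with the best final tag chosen as $\arg\max_i\,[\alpha(T-1, i) + w_{\text{end}}(i)]$. The track table stores each arg max, and the final loop walks it backwards from the best end tag to recover the path, giving O(seq_len · tag_num²) time overall.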
+ Tensor alpha; + T* alpha_value = alpha.mutable_data(emission_dims, platform::CPUPlace()); + Tensor track; + int* track_value = + track.mutable_data(emission_dims, platform::CPUPlace()); + + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + T score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + + T max_score = -std::numeric_limits::max(); + int max_i = 0; + for (size_t i = 0; i < tag_num; ++i) { + T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; --k) { + path[k - 1] = max_i = track_value[k * tag_num + max_i]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e80f77e497ee21ccd5322b544376f20cb7de012 --- /dev/null +++ b/paddle/fluid/operators/crop_op.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crop_op.h" +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CropOp should not be null."); + auto x_dim = ctx->GetInputDim("X"); + if (!ctx->HasInput("Y")) { + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_EQ( + int64_t(shape.size()), x_dim.size(), + "Shape size should be equal to dimention size of input tensor."); + std::vector tensor_shape(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + tensor_shape[i] = static_cast(shape[i]); + } + ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape)); + } else { + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim), + "Tensor rank of both CropOp's " + "inputs must be same."); + ctx->SetOutputDim("Out", y_dim); + } + } +}; + +class CropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CropOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input of pad op. 
" + "The input should be a k-D tensor(k > 0 and k < 7)."); + AddInput("Y", + "The input used as reference for cropping, " + "which is of the same dimensions as X.") + .AsDispensable(); + AddOutput("Out", + "The output of crop op, " + "which is of the same dimensions as X."); + AddAttr>("offsets", + "A list describing offsets to be cropped. " + "The size of offsets list should be the same as " + "the dimension size of input X."); + AddAttr>("shape", + "A list describing the shape of output. " + "The size of shape list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddComment(R"DOC( +Crop Operator. + +Crop input into output, as specified by offsets and shape. + +There are two ways to set shape: +1. reference input: crop input X into the same shape as reference input. + The dimension of reference input should + be the same as the dimension of input X. +2. shape list: crop input X into the shape described by a list. + The size of shape list should be the same as + the dimension size of input X. + +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +Case 1: +Given + + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + shape = [2, 2], + +we get: + + Out = [[1, 2], + [3, 4]]. + + +Case 2: +Given + + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + Y = [[0, 0, 0] + [0, 0, 0]], + +we get: + + Out = [[1, 2, 5], + [3, 4, 6]]. +)DOC"); + } +}; + +class CropOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f3610675aae1572380ca9b778ac3251c1951678b --- /dev/null +++ b/paddle/fluid/operators/crop_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/crop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9c7c0446d4c0baf1ba59eb860a928341eed7cce0 --- /dev/null +++ b/paddle/fluid/operators/crop_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { // Internal + +template +using EigenTensor = framework::EigenTensor; +using framework::Tensor; + +template +class CropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto out_stride = framework::stride(out->dims()); + auto offsets = context.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + x->dims().size(), static_cast(offsets.size()), + "Offsets size should be equal to dimension size of input tensor."); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + StridedMemcpy(context.device_context(), x_data + offset, x_stride, + out->dims(), out_stride, out_data); + } +}; + +template +void CropGradFunction(const framework::ExecutionContext& context) { + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* d_out = context.Input(framework::GradVarName("Out")); + d_x->mutable_data(context.GetPlace()); + auto offsets = context.Attr>("offsets"); + Eigen::array, D> paddings; + for (size_t i = 0; i < D; ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; + } + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + d_x_tensor.device( + *context.template device_context().eigen_device()) = + d_out_tensor.pad(paddings, 0); + } +} + +template +class CropGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + CropGradFunction(context); + break; + case 2: + CropGradFunction(context); + break; + case 3: + CropGradFunction(context); + break; + case 4: + CropGradFunction(context); + break; + case 5: + CropGradFunction(context); + break; + case 6: + CropGradFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace 
operators +} // namespace paddle diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e34b248b6aa696eaa03c7e1b4236a76a9081ef0 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class CrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(label_dims[1], 1UL, + "If Attr(softLabel) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim("Y", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". 
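// (Input(Label) holds int64 class indices in the one-hot case, so deriving
// the kernel type from Input(X) keeps the computation in X's floating point
// type rather than the label type.)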
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class CrossEntropyGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + PADDLE_ENFORCE_EQ(dy_dims[1], 1, + "The 2nd dimension of Input(Y@Grad) should be 1."); + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1], + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," + " where N is the batch size and D is the number of classes. " + "This input is a probability computed by the previous operator, " + "which is almost always the result of a softmax operator."); + AddInput("Label", + "(Tensor), the ground truth which is a 2-D tensor. When " + "soft_label is set to false, Label is a Tensor with shape " + "[N x 1]. When soft_label is set to true, Label is a " + "Tensor with shape [N x D]."); + AddOutput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape " + "[N x 1]. The cross entropy loss."); + AddAttr("soft_label", + "(bool, default false), a flag indicating whether to " + "interpretate the given labels as soft labels.") + .SetDefault(false); + AddComment(R"DOC( +CrossEntropy Operator. + +It supports both standard cross-entropy and soft-label cross-entropy loss +computation. 
+1) One-hot cross-entropy: + soft_label = false, Label[i, 0] indicates the class index for sample i: + + $Y[i] = -\log(X[i, Label[i]])$ + +2) Soft-label cross-entropy: + soft_label = true, Label[i, j] indicates the soft label of class j + for sample i: + + $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$ + + Please make sure that in this case the summuation of each row of Label + equals one. + +3) One-hot cross-entropy with vecterized Input(Label): + As a special case of 2), when each row of Input(Label) has only one + non-zero element (equals 1), soft-label cross-entropy degenerates to a + one-hot cross-entropy with one-hot label representation. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, + cross_entropy_grad, ops::CrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..de0976c69fc65ebc2ef7df9025d6071545e91c33 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
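As a small worked instance of case 1) above: for a single sample with D = 3 classes, X = [[0.2, 0.5, 0.3]] and Label = [[1]] give $Y = [-\log(0.5)] \approx [0.693]$; the equivalent soft label [[0, 1, 0]] yields the same value, which is exactly the degenerate case described in 3).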
*/ + +#include "paddle/fluid/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const int64_t* label, const int N, + const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + int idx = i * D + label[i]; + dX[idx] = -dY[i] / X[idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const T* label, const int N, + const int D) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < N * D) { + int row_ids = ids / D; + dX[ids] = -label[ids] * dY[row_ids] / X[ids]; + } +} +} // namespace + +template +class CrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); + } +}; + +template +class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + const Tensor* x = ctx.Input("X"); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const T* dy_data = + ctx.Input(framework::GradVarName("Y"))->data(); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* x_data = x->data(); + + int64_t batch_size = x->dims()[0]; + int64_t class_num = x->dims()[1]; + + int block = 512; + int grid = (batch_size * class_num + block - 1) / block; + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + if (ctx.Attr("soft_label")) { + auto* label_data = label->data(); + SoftCrossEntropyGradientKernel<<>>( + dx_data, dy_data, x_data, label_data, batch_size, class_num); + } else { + math::SetConstant functor; + functor(dev_ctx, dx, 0); + auto* label_data = label->data(); + grid = (batch_size + block - 1) / block; + CrossEntropyGradientKernel<<>>( + dx_data, dy_data, x_data, label_data, batch_size, class_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5b20ecb70887dd03865e53f168dad818195b16 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CrossEntropyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* labels = ctx.Input("Label"); + Tensor* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); + } +}; + +template +class CrossEntropyGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* x = ctx.Input("X"); + const Tensor* dy = ctx.Input(framework::GradVarName("Y")); + const Tensor* label = ctx.Input("Label"); + Tensor* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + + int64_t class_num = x->dims()[1]; + if (ctx.Attr("soft_label")) { + auto x_mat = EigenMatrix::From(*x); + auto dy_mat = EigenMatrix::From(*dy); + auto lbl_mat = EigenMatrix::From(*label); + auto dx_mat = EigenMatrix::From(*dx); + + dx_mat.device(*ctx.template device_context() + .eigen_device()) = + -(lbl_mat * + dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); + } else { + int64_t batch_size = x->dims()[0]; + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_data = label->data(); + + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); + + for (int64_t i = 0; i < batch_size; ++i) { + PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); + int64_t index = i * class_num + label_data[i]; + dx_data[index] = -dy_data[i] / x_data[index]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c7db78813e3bdd90d1c65e3af26208ae2a9ba21 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +class CTCAlignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input of CTCAlignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output of CTCAlignOp should not be null."); + + auto input_dims = ctx->GetInputDim("Input"); + + // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Output", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LodTensor, default: LoDTensor), Its shape is " + "[Lp, 1], where Lp is the sum of all input sequences' length."); + AddOutput("Output", "(Tensor, default: Tensor), The align result."); + AddAttr("blank", + "(int, default: 0), the blank label setted in Connectionist " + "Temporal Classification (CTC) op.") + .SetDefault(0); + AddAttr("merge_repeated", + "(bool, default: true), whether to " + "merge repeated elements between two blanks. ") + .SetDefault(true); + AddComment(R"DOC( +CTCAlign op is used to merge repeated elements between two blanks +and then delete all blanks in sequence. + +Given: + Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, + 6, 0, 0, 7, 7, 7, 0] + Input.dims = {18, 1} + Input.LoD = [[0, 11, 18]] + +And: + blank = 0 + merge_repeated = True + +Then: + Output.data = [1, 2, 4, 4, 5, 6, + 6, 7] + Output.dims = {8, 1} + Output.LoD = [[0, 6, 8]] + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + ctc_align, ops::CTCAlignKernel, + ops::CTCAlignKernel); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f629e0a9f15192c4d0d6fa5b8a122811d11ca415 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, + const size_t num_seq, size_t* lod0, + const int blank, const int merge_repeated, + size_t* out_lod0, T* output) { + int ouput_idx = 0; + out_lod0[0] = 0; + + for (int i = 0; i < num_seq; ++i) { + T pre_token = -1; + for (int j = lod0[i]; j < lod0[i + 1]; ++j) { + if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { + output[ouput_idx] = tokens[j]; + ++ouput_idx; + } + pre_token = tokens[j]; + } + out_lod0[i + 1] = ouput_idx; + } +} + +template +class CTCAlignOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + const size_t level = 0; + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + auto input_lod = framework::ToAbsOffset(input->lod()); + + const T* tokens = input->data(); + const int64_t num_tokens = input->dims()[0]; + const size_t num_seq = input_lod[level].size() - 1; + + const int blank = ctx.Attr("blank"); + const int merge_repeated = + static_cast(ctx.Attr("merge_repeated")); + + // prepare a lod to record lod information while merging elements + thrust::device_vector dev_out_lod0(input_lod[level].size()); + size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); + + // merge elements and delete blank + T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + + // set output lod + std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); + framework::LoD out_lod; + out_lod.push_back(host_out_lod0); + output->set_lod(out_lod); + + // resize output dims + output->Resize({static_cast(host_out_lod0.back()), 1}); + + if (host_out_lod0.back() == 0) { + output->Resize({1, 1}); + output->mutable_data(ctx.GetPlace()); + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + output, -1); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel, + paddle::operators::CTCAlignOpCUDAKernel); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1ef034c2f5b566e3cf720e295953fd7a69dd5812 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class CTCAlignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod + auto input_dims = input->dims(); + PADDLE_ENFORCE_EQ(input_dims[0], + static_cast(input_lod[level].back()), + "The first dimension of Input(Input) should be equal to " + "the sum of all sequences' lengths."); + + const size_t num_sequences = input_lod[level].size() - 1; + size_t blank = static_cast(ctx.Attr("blank")); + bool merge_repeated = ctx.Attr("merge_repeated"); + + // merge repeated tokens and delete blank + T* output_data = output->mutable_data(ctx.GetPlace()); + size_t output_idx = 0; + std::vector output_lod0(1, 0); + const T* input_data = input->data(); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(output_idx); + } + + // set output lod + framework::LoD output_lod; + output_lod.push_back(output_lod0); + output->set_lod(output_lod); + // resize output dims + output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3b2249147848b790833d09a0abe0370057ddd617 --- /dev/null +++ b/paddle/fluid/operators/cum_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d15d4e3db35c4cd27f7b990a39a40af57acd5a65 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Cumsum operator"); + AddOutput("Out", "Output of Cumsum operator"); + AddAttr("axis", + "(int, default -1). The dimenstion to accumulate along. " + "-1 means the last dimenstion") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "bool, default false). Whether to perform exclusive cumsum") + .SetDefault(false); + AddAttr("reverse", + "bool, default false). If true, the cumsum is performed in " + "the reversed direction") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. +)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e063cc0f65a5d63f8f558c5f16e548a1e1fcd4f6 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d827155919ed060df6bb45bbb54c286e81cb6c81 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace paddle { +namespace operators { + +class DecayedAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of DecayedAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of DecayedAdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension."); + PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("decay", + "(float, default 0.95) " + "Discounting factor for coming gradient") + .SetDefault(0.95); + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( +Decayed Adagrad Optimizer. 
+ +The update is done as follows: + +$$ +moment\_out = decay * moment + (1 - decay) * grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon} +$$ + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have an epsilon attribute. It is added here for numerical +stability to avoid the division by zero error. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, + ops::DecayedAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cu b/paddle/fluid/operators/decayed_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..215d6dbc7d80405bf2fdd340c280c299de9e8cc7 --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/decayed_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..52b67586ea3d138f738956c450416698042590fa --- /dev/null +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
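As a sanity check of the update above with the default decay = 0.95 and epsilon = 1e-6: starting from moment = 0 with grad = 1.0 and learning_rate = 0.1, the op produces moment_out = 0.05 and subtracts 0.1 * 1.0 / (sqrt(0.05) + 1e-6) ≈ 0.447 from the parameter.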
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DecayedAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float decay = ctx.Attr("decay"); + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto& place = *ctx.template device_context().eigen_device(); + + moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/detail/CMakeLists.txt rename to paddle/fluid/operators/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d395d347ba4f48bae6b879c1daa3adb6f838e77 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "grpc_client.h" +#include "paddle/fluid/framework/threadpool.h" +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage req; + SerializeToMessage(var_name_val, var, *p_ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& ret_msg) { + auto* outvar = var_h.scope->FindVar(var_h.name); + DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); +} + +bool RPCClient::AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + + return true; +} + +bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(BATCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + req_count_++; + + return true; +} + +bool RPCClient::Wait() { + if (req_count_ <= 0) { + return true; + } + const size_t kReqCnt = req_count_; + bool a[kReqCnt]; + std::vector> waits(req_count_); + + for (int i = 0; i < req_count_; i++) { + waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + } + + for (int i = 0; i < req_count_; i++) { + waits[i].wait(); + } + + int last_req_count = req_count_; + req_count_ = 0; + + for (int i = 0; i < last_req_count; i++) { + if (!a[i]) { + return false; + } + } + + return true; +} + +bool RPCClient::Proceed() { + void* tag = NULL; + bool ok = false; + + // request counts. 
+ if (!cq_.Next(&tag, &ok)) { + LOG(ERROR) << "Get meets CompletionQueue error"; + return false; + } + + GPR_ASSERT(ok); + PADDLE_ENFORCE(tag); + + // TODO(gongwb): add more retries. + ClientBase* c = static_cast(tag); + if (!c->status_.ok()) { + LOG(ERROR) << "proc param error:" << c->var_h_.String() + << " grpc error:" << c->status_.error_message(); + delete c; + return false; + } + + c->Process(); + delete c; + return true; +} + +std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + grpc::ChannelArguments args; + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = std::shared_ptr( + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + + channels_[ep] = ch; + return ch; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..314fe8168f0ecdae7b5d2737279050f54185e02a --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
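A rough, hypothetical sketch of how the client declared in this header is driven; the endpoint and variable names are placeholders, and real code would also fill the scope variables with tensors before sending:

namespace detail = paddle::operators::detail;

detail::RPCClient client;
paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext ctx;
scope.Var("x");    // placeholder: variable to push to the server
scope.Var("out");  // placeholder: variable to pull back

client.AsyncSendVariable("127.0.0.1:6174", ctx, scope, "x");
client.AsyncSendBatchBarrier("127.0.0.1:6174");
client.AsyncGetVariable("127.0.0.1:6174", ctx, scope, "out");
client.Wait();  // block until every queued RPC has finished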
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { +namespace detail { + +struct VarHandle { + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + std::string name; + + std::string String() const { + std::ostringstream s; + s << "name:[" << name << "] ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& msg); + +class ClientBase { + public: + explicit ClientBase(std::shared_ptr ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + context_ = NULL; + } + + virtual ~ClientBase() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Prepare(int64_t time_out) { + context_.reset(new grpc::ClientContext()); + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr stub_; + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public ClientBase { + public: + explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VoidMessage reply_; + RequestSendCallBack response_call_back_ = NULL; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public ClientBase { + public: + explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VariableMessage reply_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class BatchBarrierProcessor : public ClientBase { + public: + explicit BatchBarrierProcessor(std::shared_ptr ch) + : ClientBase(ch) {} + + virtual ~BatchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; +}; + +class RPCClient { + public: + bool AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = 600 * 1000); + + bool Wait(); + + private: + bool Proceed(); + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::map> channels_; + int64_t req_count_ = 0; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f4ea797b1d8e82fa1c2a52a8b353259906bac2 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detail/grpc_server.h" + +using grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace detail { + +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq) + : service_(service), cq_(cq), status_(PROCESS) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() { assert(false); } + + CallStatus Status() { return status_; } + void SetStatus(CallStatus status) { status_ = status; } + virtual std::string GetReqName() { + assert(false); + return ""; + } + + protected: + grpc::ServerContext ctx_; + sendrecv::SendRecvService::AsyncService* service_; + grpc::ServerCompletionQueue* cq_; + CallStatus status_; +}; + +typedef std::pair MessageWithName; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, + SimpleBlockQueue* queue) + : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { + service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, + this); + } + + virtual ~RequestSend() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + MessageWithName msg_with_name = + std::make_pair(request_.varname(), std::move(request_)); + queue_->Push(std::move(msg_with_name)); + responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + SimpleBlockQueue* queue_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + SimpleBlockQueue* queue) + : RequestBase(service, cq), + responder_(&ctx_), + scope_(scope), + dev_ctx_(dev_ctx), + queue_(queue) { + service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + } + + virtual ~RequestGet() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // proc request. + std::string var_name = request_.varname(); + auto* var = scope_->FindVar(var_name); + SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + // TODO(gongwb): check var's info. 
+ responder_.Finish(reply_, grpc::Status::OK, this); + status_ = FINISH; + queue_->Push('c'); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VariableMessage reply_; + ServerAsyncResponseWriter responder_; + framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; + SimpleBlockQueue* queue_; +}; + +void AsyncGRPCServer::WaitClientGet(int count) { + for (int i = 0; i < count; ++i) { + var_get_queue_.Pop(); + } +} + +void AsyncGRPCServer::RunSyncUpdate() { + grpc::ServerBuilder builder; + builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + builder.RegisterService(&service_); + + cq_send_ = builder.AddCompletionQueue(); + cq_get_ = builder.AddCompletionQueue(); + + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << address_ << std::endl; + + std::function send_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); + std::function get_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + + t_send_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_send_.get(), "cq_send", send_register))); + + t_get_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, + cq_get_.get(), "cq_get", get_register))); + + // wait server + server_->Wait(); + t_send_->join(); + t_get_->join(); +} + +void AsyncGRPCServer::ShutdownQueue() { + std::unique_lock lock(cq_mutex_); + cq_send_->Shutdown(); + cq_get_->Shutdown(); + is_shut_down_ = true; +} + +// This URL explains why shutdown is complicate: +void AsyncGRPCServer::ShutDown() { + server_->Shutdown(); + ShutdownQueue(); +} + +void AsyncGRPCServer::TryToRegisterNewSendOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestSend* send = + new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + VLOG(4) << "Create RequestSend status:" << send->Status(); +} + +void AsyncGRPCServer::TryToRegisterNewGetOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, + &var_get_queue_); + VLOG(4) << "Create RequestGet status:" << get->Status(); +} + +// FIXME(typhoonzero): change cq_name to enum. 
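+// Note on the pattern used by HandleRequest(): every RequestSend/RequestGet
+// object represents exactly one outstanding RPC. When cq->Next() returns a
+// tag in the PROCESS state, a fresh request object of the same kind is
+// registered first (so a call of that type is always pending) and then
+// Process() runs; when the tag comes back in the FINISH state the object is
+// simply deleted.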
+void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq, + std::string cq_name, + std::function TryToRegisterNewOne) { + TryToRegisterNewOne(); + + void* tag = NULL; + bool ok = false; + while (true) { + if (!cq->Next(&tag, &ok)) { + LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + break; + } + + PADDLE_ENFORCE(tag); + // FIXME(typhoonzero): de-couple the barriers with recv_op + if (cq_name == "cq_get") WaitCond(1); + if (cq_name == "cq_send") WaitCond(0); + + RequestBase* base = (RequestBase*)tag; + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + LOG(WARNING) << cq_name << " recv no regular event:argument name" + << base->GetReqName(); + TryToRegisterNewOne(); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + VLOG(4) << cq_name << " status:" << base->Status(); + TryToRegisterNewOne(); + base->Process(); + break; + } + case FINISH: { + VLOG(4) << cq_name << " status:" << base->Status(); + delete base; + break; + } + default: { assert(false); } + } + } +} + +void AsyncGRPCServer::WaitCond(int cond) { + std::unique_lock lock(this->barrier_mutex_); + barrier_condition_.wait(lock, + [=] { return this->barrier_cond_step_ == cond; }); +} + +void AsyncGRPCServer::SetCond(int cond) { + { + std::lock_guard lock(this->barrier_mutex_); + barrier_cond_step_ = cond; + } + barrier_condition_.notify_all(); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..1382d1731838c72008ef782d1c398f534f23f7e6 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +#include +#include +#include +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +typedef std::pair MessageWithName; +class RequestBase; + +class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { + public: + explicit AsyncGRPCServer(const std::string &address) : address_(address) {} + + void RunSyncUpdate(); + + // functions to sync server barrier status. 
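+  // Rough sketch of how an owner (for example a recv/listen operator, which
+  // is not part of this change) might drive the barrier, based only on the
+  // methods declared here; illustrative, not taken from this diff:
+  //   server.SetCond(0);         // unblock queued Send requests
+  //   /* drain Get() until a batch barrier message arrives */
+  //   server.SetCond(1);         // unblock queued Get requests
+  //   server.WaitClientGet(n);   // block until n clients fetched the results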
+ void WaitCond(int cond); + void SetCond(int cond); + void WaitClientGet(int count); + + void SetScope(framework::Scope *scope) { scope_ = scope; } + + void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + + void ShutDown(); + + protected: + void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name, + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(); + void TryToRegisterNewGetOne(); + void ShutdownQueue(); + + private: + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + std::unique_ptr cq_send_; + std::unique_ptr cq_get_; + + sendrecv::SendRecvService::AsyncService service_; + std::unique_ptr server_; + + std::string address_; + framework::Scope *scope_; + const platform::DeviceContext *dev_ctx_; + // received variable from RPC, operators fetch variable from this queue. + SimpleBlockQueue var_recv_queue_; + SimpleBlockQueue var_get_queue_; + + // condition of the sub program + std::mutex barrier_mutex_; + mutable int barrier_cond_step_; + std::condition_variable barrier_condition_; + + std::unique_ptr t_send_; + std::unique_ptr t_get_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h similarity index 100% rename from paddle/operators/detail/safe_ref.h rename to paddle/fluid/operators/detail/safe_ref.h diff --git a/paddle/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto similarity index 100% rename from paddle/operators/detail/send_recv.proto rename to paddle/fluid/operators/detail/send_recv.proto diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba3ae6add6099cb232a5e0df82550b9c2628c05c --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fed887c02796463b4b1b7a747883c702c2a95a72 --- /dev/null +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg); + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h similarity index 100% rename from paddle/operators/detail/simple_block_queue.h rename to paddle/fluid/operators/detail/simple_block_queue.h diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..d7a7eed50b961b0efc04a2a636178fa6578cbf3a --- /dev/null +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace detail { + +template +struct StridedMemcpyFunctor; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim<1> src_stride, framework::Dim<1> dst_dim, + framework::Dim<1> dst_stride, T* dst) const { + auto place = dev_ctx.GetPlace(); + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto& cuda_ctx = + reinterpret_cast(dev_ctx); + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + cuda_ctx.stream()); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } +}; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + framework::Dim src_stride, framework::Dim dst_dim, + framework::Dim dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim.head; ++i) { + StridedMemcpyFunctor func; + func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); + src += src_stride.head; + dst += dst_stride.head; + } + } +}; + +template +struct StridedCopyDimVisitor : public boost::static_visitor { + StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_stride, T* dst) + : dev_ctx_(dev_ctx), + src_(src), + src_stride_(src_stride), + dst_stride_(dst_stride), + dst_(dst) {} + + template + void operator()(Dim dst_dim) const { + Dim src_stride = boost::get(src_stride_); + Dim dst_stride = boost::get(dst_stride_); + constexpr int dim = Dim::dimensions; + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + } + + const platform::DeviceContext& dev_ctx_; + const T* src_; + const framework::DDim& src_stride_; + const framework::DDim& dst_stride_; + T* dst_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6dee5222959f00141cf5c09257d4a2c96b9e3746 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection_output_op.h" +namespace paddle { +namespace operators { + +class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Loc", + "(Tensor) The input tensor of detection_output operator." 
+ "The input predict locations" + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is 4, H and W both are 1."); + AddInput("Conf", + "(Tensor) The input tensor of detection_output operator." + "The input priorbox confidence." + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is the number of classes, H and W both are 1."); + AddInput("PriorBox", + "(Tensor) The input tensor of detection_output operator." + "The format of input tensor is the position and variance " + "of the boxes"); + AddOutput("Out", + "(Tensor) The output tensor of detection_output operator."); + AddAttr("background_label_id", "(int), The background class index."); + AddAttr("num_classes", "(int), The number of the classification."); + AddAttr("nms_threshold", + "(float), The Non-maximum suppression threshold."); + AddAttr("confidence_threshold", + "(float), The classification confidence threshold."); + AddAttr("top_k", "(int), The bbox number kept of the layer’s output."); + AddAttr("nms_top_k", + "(int), The bbox number kept of the NMS’s output."); + AddComment(R"DOC( + detection output for SSD(single shot multibox detector) + Apply the NMS to the output of network and compute the predict + bounding box location. The output’s shape of this layer could + be zero if there is no valid bounding box. + )DOC"); + } +}; + +class DetectionOutputOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Loc"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Conf"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(X) of DetectionOutputOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of DetectionOutputOp should not be null."); + std::vector output_shape({1, 7}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, + ops::DetectionOutputOpMaker); +REGISTER_OP_CPU_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..309e03a25be95362b11689f873bafe68570c42e4 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection_output_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + detection_output, + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..05e5b72bd354329d575c33a88189cfbc64abfea9 --- /dev/null +++ b/paddle/fluid/operators/detection_output_op.h @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/detection_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/strided_memcpy.h" +namespace paddle { +namespace operators { +template +inline void transpose_fun(const framework::ExecutionContext& context, + const framework::Tensor& src, + framework::Tensor* dst) { + int input_nums = src.dims()[0]; + int offset = 0; + for (int j = 0; j < input_nums; ++j) { + framework::Tensor in_p_tensor = src.Slice(j, j + 1); + std::vector shape_vec( + {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], + in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); + framework::DDim shape(framework::make_ddim(shape_vec)); + framework::Tensor in_p_tensor_transpose; + in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); + std::vector shape_axis({0, 1, 3, 4, 2}); + math::Transpose trans5; + trans5(context.template device_context(), in_p_tensor, + &in_p_tensor_transpose, shape_axis); + auto dst_stride = framework::stride(dst->dims()); + auto src_stride = framework::stride(in_p_tensor_transpose.dims()); + StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), + src_stride, in_p_tensor_transpose.dims(), dst_stride, + dst->data() + offset); + offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; + } +} +template +class DetectionOutputKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + size_t batch_size = in_conf->dims()[1]; + int conf_sum_size = in_conf->numel(); + // for softmax + std::vector conf_shape_softmax_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim 
conf_shape_softmax( + framework::make_ddim(conf_shape_softmax_vec)); + // for knchw => nhwc + std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], + in_loc->dims()[4], + in_loc->dims()[2] * in_loc->dims()[0]}); + std::vector conf_shape_vec( + {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2] * in_conf->dims()[0]}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + // for cpu + framework::Tensor loc_cpu; + framework::Tensor conf_cpu; + framework::Tensor priorbox_cpu; + const T* priorbox_data = in_priorbox->data(); + transpose_fun(context, *in_loc, &loc_tensor); + transpose_fun(context, *in_conf, &conf_tensor); + conf_tensor.Resize(conf_shape_softmax); + math::SoftmaxFunctor()( + context.template device_context(), &conf_tensor, + &conf_tensor); + T* loc_data = loc_tensor.data(); + T* conf_data = conf_tensor.data(); + if (platform::is_gpu_place(context.GetPlace())) { + loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); + framework::Copy(loc_tensor, platform::CPUPlace(), + context.device_context(), &loc_cpu); + loc_data = loc_cpu.data(); + conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); + framework::Copy(conf_tensor, platform::CPUPlace(), + context.device_context(), &conf_cpu); + conf_data = conf_cpu.data(); + priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); + framework::Copy(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); + priorbox_data = priorbox_cpu.data(); + } + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); + math::BBox bbox = math::DecodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + std::vector>> all_indices; + int num_kept = math::GetDetectionIndices( + conf_data, num_priors, num_classes, background_label_id, batch_size, + confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->mutable_data(out_shape, context.GetPlace()); + framework::Tensor out_cpu; + T* out_data = out->data(); + if (platform::is_gpu_place(context.GetPlace())) { + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + out_data = out_cpu.data(); + } + math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); + if 
(platform::is_gpu_place(context.GetPlace())) { + framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(), + out); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1dc900512cb2c20cc1a39b5d11a78f5eb905dc5 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class DropoutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + if (ctx->Attrs().Get("is_test") == false) { + ctx->SetOutputDim("Mask", x_dims); + } + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of dropout op."); + AddOutput("Out", "The output of dropout op."); + AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); + + AddAttr("dropout_prob", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float& drop_p) { + PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, + "'dropout_prob' must be between 0.0 and 1.0."); + }); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittests or for debugging, so that the same output units " + "are always dropped.") + .SetDefault(false); + AddAttr("seed", "Dropout random seed.").SetDefault(0); + + AddComment(R"DOC( +Dropout Operator. + +Dropout refers to randomly dropping out units in a neural network. It is a +regularization technique for reducing overfitting by preventing neuron +co-adaptation during training. The dropout operator randomly sets (according to +the given dropout probability) the outputs of some units to zero, while others +are set equal to their corresponding inputs.
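+
+For example, with dropout_prob = 0.25 and input X = [1.0, 2.0, 3.0, 4.0], one
+possible training-time result is Mask = [1, 0, 1, 1] and Out = [1.0, 0.0, 3.0, 4.0].
+In the test phase (is_test = true) no unit is dropped and the output is the input
+scaled by (1 - dropout_prob), i.e. Out = 0.75 * X in this example.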
+ +)DOC"); + } +}; + +template +class DropoutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + "GradOp is only callable when is_test is false"); + + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims, out_dims, + "Dimensions of Input(X) and Out@Grad must be the same."); + auto mask_dims = ctx->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(x_dims, mask_dims, + "Dimensions of Input(X) and Mask must be the same."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, + ops::DropoutOpGrad); +REGISTER_OP_CPU_KERNEL( + dropout, + ops::CPUDropoutKernel); +REGISTER_OP_CPU_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ae9f4ce54d27dd1ad0312b5ad8d78a4cb904c79 --- /dev/null +++ b/paddle/fluid/operators/dropout_op.cu @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include "paddle/fluid/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +template +struct MaskGenerator { + AttrType dropout_prob; + int seed; + + __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) + : dropout_prob(dropout_prob), seed(seed) {} + + inline __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(0, 1); + rng.discard(n); + if (dist(rng) < dropout_prob) { + return static_cast(0); + } + return static_cast(1); + } +}; + +// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class GPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + y->mutable_data(context.GetPlace()); + AttrType dropout_prob = context.Attr("dropout_prob"); + + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + + auto& place = *context.template device_context().eigen_device(); + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + int size = framework::product(mask->dims()); + + std::random_device rnd; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(mask_data), + MaskGenerator(dropout_prob, seed)); + auto M = EigenMatrix::Reshape(*mask, 1); + Y.device(place) = X * M; + } else { + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9dd1f33669ccf89202abe4a80bd9796411f630ba --- /dev/null +++ b/paddle/fluid/operators/dropout_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + const auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + float dropout_prob = context.Attr("dropout_prob"); + + if (!context.Attr("is_test")) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + engine.seed(seed); + + std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < dropout_prob) { + mask_data[i] = 0; + y_data[i] = 0; + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } + } + } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto& place = + *context.template device_context().eigen_device(); + Y.device(place) = X * (1.0f - dropout_prob); + } + } +}; + +template +class DropoutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(!context.Attr("is_test"), + "GradOp is only callable when is_test is false"); + + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* mask = context.Input("Mask"); + grad_x->mutable_data(context.GetPlace()); + + auto M = EigenMatrix::Reshape(*mask, 1); + auto dX = EigenMatrix::Reshape(*grad_x, 1); + auto dY = EigenMatrix::Reshape(*grad_y, 1); + + auto& place = + *context.template device_context().eigen_device(); + dX.device(place) = dY * M; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae82408da71f9424a7a64dc9d3e42759707683b9 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/edit_distance_op.h" + +namespace paddle { +namespace operators { + +class EditDistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"), + "Output(SequenceNum) shouldn't be null."); + auto hyp_dims = ctx->GetInputDim("Hyps"); + auto ref_dims = ctx->GetInputDim("Refs"); + PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, + "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1, + "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + ctx->SetOutputDim("SequenceNum", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + ctx.device_context()); + } +}; + +class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Hyps", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for hypothesis strings."); + AddInput("Refs", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for reference strings."); + AddOutput("SequenceNum", "The sequence count of the current batch"); + AddAttr("normalized", + "(bool, default false) Indicates whether to normalize " + "the edit distance by the length of the reference string.") + .SetDefault(false); + AddOutput("Out", + "(2-D Tensor with shape [`batch_size` x 1]) " + "The output edit distances of EditDistance operator."); + AddComment(R"DOC( + +EditDistance operator computes the edit distances between a batch of hypothesis +strings and their references. + +Edit distance, also called Levenshtein distance, measures how dissimilar two strings +are by counting the minimum number of operations to transform one string into another. +Here the operations include insertion, deletion, and substitution. For example, +given hypothesis string A = "kitten" and reference B = "sitting", the edit distance +is 3, since transforming A into B takes at least two substitutions and one +insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + +Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total +number denoted by `batch_size`, and the separation is specified by the LoD information. +And the `batch_size` reference strings are arranged in order in the same way in the +LoDTensor Input(Refs). + +Output(Out) contains the `batch_size` results, each of which is the edit distance +for a pair of strings. If Attr(normalized) is true, the edit distance +will be divided by the length of the reference string.
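+For the example above, the normalized result would be 3 / 7 (about 0.43), since the
+reference "sitting" has length 7.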
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + edit_distance, ops::EditDistanceKernel); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdfead75e71752549f44a8b3c9b9e4501e8845a3 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillFirstRow(T* dist, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < N + 1) { + dist[idx] = idx; + } +} + +template +__global__ void FillFirstColumn(T* dist, const int M, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < M + 1) { + dist[idx * (N + 1)] = idx; + } +} + +template +__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2, + const int M, const int N, const int start) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int offset = N; + int index = start + idx * offset; + int row = index / (N + 1); + int col = index % (N + 1); + if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { + int cost = x1[row - 1] == x2[col - 1] ? 0 : 1; + int dels = dist[(row - 1) * (N + 1) + col] + 1; + int ins = dist[row * (N + 1) + col - 1] + 1; + int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost; + dist[index] = min(dels, min(ins, subs)); + } +} + +template +__global__ void SetOutput(T* out, const T* dist, const int M, const int N, + bool normalized) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx == 0) { + out[0] = normalized ? 
dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N]; + } +} + +template +class EditDistanceGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + + const size_t num_strs = hyp_lod.size() - 1; + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + sequence_num, static_cast(num_strs)); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; num++) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + if (m == 0 || n == 0) { + distance = std::max(m, n); + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + memory::Copy(boost::get(ctx.GetPlace()), out + num, + platform::CPUPlace(), &distance, sizeof(T), stream); + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + + FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + + FillFirstRow<<<1 + n / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + // Compute the elements of distance matrix in the anti-diagonal diretion + for (int64_t slice = 2; slice < m + n + 1; ++slice) { + int z_m = slice < m + 1 ? 0 : slice - m; + int z_n = slice < n + 1 ? 0 : slice - n; + int size = slice - (z_m + z_n) + 1; // number of elments in the same + // anti-diagonal line to update + // the start index at which computes from + int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, + m, n, start); + } + SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + edit_distance, + ops::EditDistanceGPUKernel); diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..205e16e6bfe6b2d1678fca258ce1e70d29eff331 --- /dev/null +++ b/paddle/fluid/operators/edit_distance_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +template +class EditDistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + auto* sequence_num = ctx.Output("SequenceNum"); + int64_t* seq_num_data = sequence_num->mutable_data(ctx.GetPlace()); + + auto normalized = ctx.Attr("normalized"); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + auto num_strs = hyp_lod.size() - 1; + *seq_num_data = static_cast(num_strs); + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; ++num) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + + if (m == 0) { + distance = n; + } else if (n == 0) { + distance = m; + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + for (int64_t i = 0; i < m + 1; ++i) { + dist[i * (n + 1)] = i; + } + for (int64_t j = 0; j < n + 1; ++j) { + dist[j] = j; + } + for (int64_t i = 1; i < m + 1; ++i) { + for (int64_t j = 1; j < n + 1; ++j) { + int cost = x1[i - 1] == x2[j - 1] ? 0 : 1; + int dels = dist[(i - 1) * (n + 1) + j] + 1; + int ins = dist[i * (n + 1) + (j - 1)] + 1; + int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost; + dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs)); + } + } + distance = dist[m * (n + 1) + n]; + } + + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + out[num] = distance; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b9947b8c935fd2b4739162cbd2f98dc965cec2a --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseAddOpMaker : public ElementwiseOpMaker { + public: + ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Add", "Out = X + Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, + elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ac3a998ec46528ccb72fad1f5d73ce88992d995 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_add_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_grad, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h new file mode 100644 index 0000000000000000000000000000000000000000..248e3b9d617fadd914f27ad02861e27078600b61 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +class ElementwiseAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); + } +}; + +template +struct ElementwiseAddGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e; + } + } +}; + +template +struct ElementwiseAddBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseAddBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseAddBroadCastGradFunctor, + ElementwiseAddBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..818ae82f44ccd159b36944e67521c3b730214539 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise_div_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + public: + ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Div", "Out = X / Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, + elementwise_div_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d1bb7a474c06f68d33412512a9eb99757634d18e --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_div_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8e0726d9465fa988646f6f8dc74857a3bedf43e8 --- /dev/null +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct DivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } +}; + +template +class ElementwiseDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); + } +}; + +template +struct ElementwiseDivGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto y_e = framework::EigenVector::Flatten(*y); + auto z_e = framework::EigenVector::Flatten(*z); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = -1.0 * dz_e * z_e / y_e; + } + } +}; + +template +struct ElementwiseDivBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseDivBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e / y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseDivGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseDivBroadCastGradFunctor, + ElementwiseDivBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc new file mode 100644 index 
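elementwise_div_op.h applies the quotient rule as dX = dOut / Y and dY = -dOut * Out / Y, which equals -dOut * X / Y^2 since Out = X / Y. A small standalone check of those formulas against finite differences (illustrative values only):

#include <cmath>
#include <cstdio>

int main() {
  double x = 3.0, y = 4.0, dout = 1.0, eps = 1e-6;
  double out = x / y;

  // Closed forms used by the gradient functor.
  double dx = dout / y;
  double dy = -dout * out / y;  // == -dout * x / (y * y)

  // Finite-difference estimates of d(x/y)/dx and d(x/y)/dy.
  double dx_fd = ((x + eps) / y - out) / eps;
  double dy_fd = (x / (y + eps) - out) / eps;

  std::printf("dx=%f (fd %f), dy=%f (fd %f)\n", dx, dx_fd, dy, dy_fd);
  return 0;
}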
0000000000000000000000000000000000000000..1331bcadc8ce9a114d3dd7604273a3512e821e91 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Max", "Out = max(X, Y)"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker, + elementwise_max_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.cu b/paddle/fluid/operators/elementwise_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f0259ad0024c1a9b640f73ade5711b6eaa8f871 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_max_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_max, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_max_grad, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e1db9bcc01104c3462b562b7a37b5817e867d7e4 --- /dev/null +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MaxFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? a : b; } +}; + +template +class ElementwiseMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); + } +}; + +template +struct ElementwiseMaxGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; + } + } +}; + +template +struct ElementwiseMaxBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMaxBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out 
= ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMaxBroadCastGradFunctor, + ElementwiseMaxBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d69099c8e6bd8e11a605758f553a9edd9cc322e --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Min", "Out = min(X, Y)"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker, + elementwise_min_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.cu b/paddle/fluid/operators/elementwise_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed53204735056477b5c59ce5082d377501409c65 --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
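For max (and min, below) the gradient is not split between the inputs: the whole of dOut is routed to whichever input wins the comparison, with ties going to Y because the masks are (x > y) for dX and (x <= y) for dY. A standalone sketch of that masking (plain arrays, illustrative values):

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 5.f, 3.f};
  std::vector<float> y = {4.f, 2.f, 3.f};
  std::vector<float> dout = {1.f, 1.f, 1.f};
  std::vector<float> dx(3), dy(3);

  // Same masks as ElementwiseMaxGradFunctor: the winning input receives the
  // whole upstream gradient; on ties (x == y) it goes to Y.
  for (int i = 0; i < 3; ++i) {
    dx[i] = (x[i] > y[i]) ? dout[i] : 0.f;
    dy[i] = (x[i] <= y[i]) ? dout[i] : 0.f;
  }
  for (int i = 0; i < 3; ++i)
    std::printf("dx[%d]=%g dy[%d]=%g\n", i, dx[i], i, dy[i]);
  // dx = {0, 1, 0}, dy = {1, 0, 1}
  return 0;
}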
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_min_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_min, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel, + ops::ElementwiseMinKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel, + ops::ElementwiseMinGradKernel); diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bfe213dd4318aef8b1fb299ba40981d2feef8f9d --- /dev/null +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MinFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? a : b; } +}; + +template +class ElementwiseMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); + } +}; + +template +struct ElementwiseMinGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e).template cast() * dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; + } + } +}; + +template +struct ElementwiseMinBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMinBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = 
framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMinBroadCastGradFunctor, + ElementwiseMinBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb96f21d1b5049f9c7193f0a951e08986884c70 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMulOpMaker : public ElementwiseOpMaker { + public: + ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Mul", "Out = X \\odot\\ Y"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, + elementwise_mul_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d72b6250eed24adbfc18e95a659a82b8e9916bc1 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_mul_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_mul, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc292eb1e7295780aacb0c34730044c3c8759cb7 --- /dev/null +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +class ElementwiseMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); + } +}; + +template +struct ElementwiseMulGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = x_e * dz_e; + } + } +}; + +template +struct ElementwiseMulBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) + .broadcast(Eigen::DSizes(pre, 1)) + .reshape(Eigen::DSizes(x_e.size())); + + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseMulBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto x_e = framework::EigenVector::Flatten(*x); + auto y_e = framework::EigenVector::Flatten(*y); + auto dz_e = framework::EigenVector::Flatten(*dz); + + auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) + .broadcast(Eigen::DSizes(pre, 1, post)) + .reshape(Eigen::DSizes(x_e.size())); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e * y_e_bcast; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (x_e * dz_e) + .reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseMulBroadCastGradFunctor, + ElementwiseMulBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h new file mode 100644 index 0000000000000000000000000000000000000000..38f83d7ad36d3cb6b42a283fec3c431b51747d4e --- /dev/null +++ b/paddle/fluid/operators/elementwise_op.h @@ -0,0 +1,137 @@ 
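For the same-shape case, ElementwiseMulGradFunctor implements the product rule directly: each input's gradient is dOut scaled by the other input. A minimal standalone check (illustrative scalars, not framework code):

#include <cstdio>

int main() {
  double x = 3.0, y = 4.0, dout = 0.5;
  // d(x*y)/dx = y, d(x*y)/dy = x, each scaled by the upstream gradient.
  double dx = dout * y;  // 2.0
  double dy = x * dout;  // 1.5
  std::printf("dx=%g dy=%g\n", dx, dy);
  return 0;
}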
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class ElementwiseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + using Tensor = framework::Tensor; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of elementwise op."); + AddInput("Y", "(Tensor), The second input tensor of elementwise op."); + AddOutput("Out", "The output of elementwise op."); + AddAttr("axis", + "(int, default -1). The start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1) + .EqualGreaterThan(-1); + comment_ = R"DOC( +Limited Elementwise {name} Operator. + +The equation is: + +$${equation}$$ + +$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be +smaller than or equal to the dimensions of $X$. + +There are two cases for this operator: +1. The shape of $Y$ is same with $X$; +2. The shape of $Y$ is a subset of $X$. + +For case 2: +$Y$ will be broadcasted to match the shape of $X$ and axis should be +set to index of the start dimension to broadcast $Y$ onto $X$. + +For example + .. code-block:: python + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details) +information. However, the output only shares the LoD information with input $X$. 
+ +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; + + void Replace(std::string& src, std::string from, std::string to) { + std::size_t len_from = std::strlen(from.c_str()); + std::size_t len_to = std::strlen(to.c_str()); + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } + + void SetComment(std::string name, std::string equation) { + Replace(comment_, "{name}", name); + Replace(comment_, "{equation}", equation); + } +}; + +class ElementwiseOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h new file mode 100644 index 0000000000000000000000000000000000000000..c1269382a447d4f8d089c5fc392495d418123e48 --- /dev/null +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -0,0 +1,406 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/transform.h" + +#ifdef __NVCC__ +#include +#endif + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5) + * 2. 
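SetComment fills the {name} and {equation} placeholders of the shared comment template through the Replace helper, so every concrete op maker (Add, Sub, Mul, ...) reuses one documentation string. A standalone sketch of that substitution using std::string only (hypothetical template text):

#include <cstdio>
#include <string>

// Mirrors the Replace helper above: substitute every occurrence of `from`.
void Replace(std::string& src, const std::string& from, const std::string& to) {
  for (std::size_t pos = src.find(from); pos != std::string::npos;
       pos = src.find(from, pos + to.size())) {
    src.replace(pos, from.size(), to);
  }
}

int main() {
  std::string comment =
      "Limited Elementwise {name} Operator.\nThe equation is: {equation}\n";
  Replace(comment, "{name}", "Add");
  Replace(comment, "{equation}", "Out = X + Y");
  std::printf("%s", comment.c_str());
  return 0;
}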
shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20) + */ +inline void get_mid_dims(const framework::DDim& x_dims, + const framework::DDim& y_dims, const int axis, + int& pre, int& n, int& post) { + pre = 1; + n = 1; + post = 1; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + + for (int i = 0; i < y_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], + "Broadcast dimension mismatch."); + n *= y_dims[i]; + } + + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + post *= x_dims[i]; + } +} + +template +class RowwiseTransformIterator; +template +class MidWiseTransformIterator; + +template +class RowwiseTransformIterator { + public: + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + return *this; + } + + bool operator==(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const RowwiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int i_; + int64_t n_; +}; + +template +class MidWiseTransformIterator { + public: + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + return *this; + } + + bool operator==(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) == &(*rhs); + } + + bool operator!=(const MidWiseTransformIterator& + rhs) const { + return (ptr_ + i_) != &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + private: + const T* ptr_; + int64_t i_; + int64_t j_; + int64_t n_; + int64_t post_; +}; + +#ifdef __NVCC__ +template +class RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +class MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + HOSTDEVICE typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + +template +class TransformFunctor { + public: + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const DeviceContext& ctx, Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + 
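get_mid_dims collapses X into a (pre, n, post) view around the dimensions that Y covers, as described in the comment above. The standalone sketch below redoes that bookkeeping with plain std::vector shapes and reproduces the two worked examples from the comment:

#include <cassert>
#include <cstdio>
#include <vector>

void GetMidDims(const std::vector<int>& x_dims, const std::vector<int>& y_dims,
                int axis, int* pre, int* n, int* post) {
  *pre = 1; *n = 1; *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) {
    assert(x_dims[i + axis] == y_dims[i] && "Broadcast dimension mismatch.");
    *n *= y_dims[i];
  }
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= x_dims[i];
}

int main() {
  int pre, n, post;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1  ->  pre=2, n=12, post=5
  GetMidDims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);
  // shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), axis = 2  ->  pre=6, n=20, post=1
  GetMidDims({2, 3, 4, 5}, {4, 5}, 2, &pre, &n, &post);
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);
  return 0;
}

Once the shape is folded this way, X is effectively a (pre, n, post) block and Y a length-n vector, which is what the rowwise and midwise paths below rely on.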
platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } + + private: + const T* x_; + const T* y_; + OutType* z_; + int64_t nx_; + const DeviceContext& ctx_; + Functor func_; +}; + +#define EIGEN_FUNCTOR(name, eigen_op) \ + struct Eigen##name##Functor { \ + template \ + inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ + } \ + template \ + inline void RunBroadCast(const framework::Tensor* x, \ + const framework::Tensor* y, framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ + .broadcast(Eigen::DSizes(pre, 1)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + template \ + inline void RunBroadCast2(const framework::Tensor* x, \ + const framework::Tensor* y, \ + framework::Tensor* z, \ + const framework::ExecutionContext& ctx, int pre, \ + int n, int post) { \ + auto x_e = framework::EigenVector::Flatten(*x); \ + auto y_e = framework::EigenVector::Flatten(*y); \ + auto z_e = framework::EigenVector::Flatten(*z); \ + auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ + .broadcast(Eigen::DSizes(pre, 1, post)) \ + .reshape(Eigen::DSizes(x_e.size())); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ + } \ + } + +template +void ElementwiseCompute(const framework::ExecutionContext& ctx) { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor f; + f.template Run(x, y, z, ctx); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
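RunRowWise and RunMidWise never materialise a broadcast copy of Y; the transform iterators simply cycle through Y while X is walked once, which is equivalent to indexing Y with i % n (rowwise) or (i / post) % n (midwise). A standalone sketch of that indexing (plain loops, illustrative data):

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  std::vector<float> y = {100, 200, 300};

  // Rowwise broadcast: pre=4, n=3; the y index cycles as i % n.
  std::vector<float> row(x.size());
  for (size_t i = 0; i < x.size(); ++i) row[i] = x[i] + y[i % 3];

  // Midwise broadcast: pre=2, n=3, post=2; the y index is (i / post) % n.
  std::vector<float> mid(x.size());
  for (size_t i = 0; i < x.size(); ++i) mid[i] = x[i] + y[(i / 2) % 3];

  for (float v : row) std::printf("%g ", v);
  std::printf("\n");
  for (float v : mid) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}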
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor f; + f.template RunBroadCast(x, y, z, ctx, pre, n); + return; + } else { + functor f; + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + return; + } +} + +#define EIGEN_ADD(x, y) ((x) + (y)) +EIGEN_FUNCTOR(Add, EIGEN_ADD); + +#define EIGEN_SUB(x, y) ((x) - (y)) +EIGEN_FUNCTOR(Sub, EIGEN_SUB); + +#define EIGEN_MUL(x, y) ((x) * (y)) +EIGEN_FUNCTOR(Mul, EIGEN_MUL); + +#define EIGEN_DIV(x, y) ((x) / (y)) +EIGEN_FUNCTOR(Div, EIGEN_DIV); + +template +void ElementwiseGradCompute(const framework::ExecutionContext& ctx, + + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, int axis, + framework::Tensor* dx, framework::Tensor* dy) { + auto& place = *ctx.template device_context().eigen_device(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + } + + if (x_dims == y_dims) { + functor f; + f(place, x, y, out, dx, dy, dout); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + + if (post == 1) { + broadcastfunctor f; + f(place, x, y, out, dx, dy, dout, pre, n); + return; + } else { + broadcast2functor f; + f(place, x, y, out, dx, dy, dout, pre, n, post); + return; + } +} + +template +void ElementwiseComputeEx(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, int axis, Functor func, + framework::Tensor* z) { + TransformFunctor functor( + x, y, z, ctx.template device_context(), func); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; + } + + if (y_dims.size() == 1 && y_dims[0] == 1) { + // y is a scalar + auto extended_dims = framework::vectorize(x_dims); + extended_dims.push_back(1); + x_dims = framework::make_ddim(extended_dims); + } + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..911b5dbd2501e6c5ef6177a23592fadeb3383002 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
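Both the forward and the gradient paths above choose between three code paths from the shapes alone: identical shapes run the flat loop, post == 1 takes the rowwise path, and anything else takes the midwise path, with axis == -1 meaning "align Y with the trailing dimensions of X". A compact sketch of just that dispatch decision (assumed shapes, strings standing in for the chosen member function):

#include <cstdio>
#include <vector>

const char* ChoosePath(std::vector<int> x_dims, const std::vector<int>& y_dims,
                       int axis) {
  if (x_dims == y_dims) return "flat (Run)";
  // A scalar Y is treated as a 1-element trailing dimension of X.
  if (y_dims.size() == 1 && y_dims[0] == 1) x_dims.push_back(1);
  if (axis == -1) axis = static_cast<int>(x_dims.size() - y_dims.size());
  int post = 1;
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    post *= x_dims[i];
  return post == 1 ? "rowwise (RunRowWise)" : "midwise (RunMidWise)";
}

int main() {
  std::printf("%s\n", ChoosePath({2, 3, 4, 5}, {2, 3, 4, 5}, -1));  // flat
  std::printf("%s\n", ChoosePath({2, 3, 4, 5}, {4, 5}, -1));        // rowwise
  std::printf("%s\n", ChoosePath({2, 3, 4, 5}, {3, 4}, 1));         // midwise
  return 0;
}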
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_pow_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwisePowOpMaker : public ElementwiseOpMaker { + public: + ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Pow", "Out = X ^ Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker); +REGISTER_OP_CPU_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise_pow_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2996600738fe11c3fef67c3f4c5660ff05e37957 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_pow_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/fluid/operators/elementwise_pow_op.h b/paddle/fluid/operators/elementwise_pow_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b793c1eae0ec3a796897c7d81ac061f80ccffdb6 --- /dev/null +++ b/paddle/fluid/operators/elementwise_pow_op.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct PowFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } +}; + +template +class ElementwisePowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..46ce01c7cf5bf4930d05535e22f1d54073838071 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseSubOpMaker : public ElementwiseOpMaker { + public: + ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Sub", "Out = X - Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, + elementwise_sub_grad, ops::ElementwiseOpGrad); +REGISTER_OP_CPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb09d6c5edcb6e8460de71d76077fd103d799847 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
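elementwise_pow reuses the same ElementwiseComputeEx plumbing with a std::pow functor; note that, unlike the other elementwise ops in this change, it is registered through REGISTER_OP_WITHOUT_GRADIENT and for fewer element types. A one-line standalone illustration of the functor's semantics:

#include <cmath>
#include <cstdio>

// Same scalar rule as PowFunctor: element-wise x raised to the power y.
template <typename T>
T Pow(T a, T b) { return std::pow(a, b); }

int main() {
  std::printf("%g %g\n", Pow(2.0, 10.0), Pow(9.0, 0.5));  // prints: 1024 3
  return 0;
}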
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/elementwise_sub_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_sub, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h new file mode 100644 index 0000000000000000000000000000000000000000..af2d497b9ae8f892aa272211ee2158d063d13909 --- /dev/null +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +class ElementwiseSubKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); + } +}; + +template +struct ElementwiseSubGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * dz_e; + } + } +}; + +template +struct ElementwiseSubBroadCastGradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n)) + .sum(Eigen::array{{0}}); + } + } +}; + +template +struct ElementwiseSubBroadCast2GradFunctor { + template + void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, + Post post) { + auto dz_e = framework::EigenVector::Flatten(*dz); + if (dx) { + auto dx_e = framework::EigenVector::Flatten(*dx); + dx_e.device(d) = dz_e; + } + + if (dy) { + auto dy_e = framework::EigenVector::Flatten(*dy); + dy_e.device(d) = (-1.0) * + dz_e.reshape(Eigen::DSizes(pre, n, post)) + .sum(Eigen::array{{0, 2}}); + } + } +}; + +template +class ElementwiseSubGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + 
auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElementwiseGradCompute, + ElementwiseSubBroadCastGradFunctor, + ElementwiseSubBroadCast2GradFunctor>( + ctx, x, y, out, dout, axis, dx, dy); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccb9a94856fe868c8069510a7c557dfb8c22c369 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class ExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto x_dims = ctx->GetInputDim("X"); + + PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), + "The number of Attr(expand_times)'s value must be equal " + "to the rank of Input(X)."); + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "The rank of Input(X) must not be greater than 6."); + + std::vector out_shape(x_dims.size()); + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_GE(expand_times[i], 1, + "Each value of Attr(expand_times) should not be " + "less than 1."); + out_shape[i] = x_dims[i] * expand_times[i]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + ctx->ShareLoD("X", "Out"); + } + } +}; + +class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "X is the input to be expanded."); + AddOutput("Out", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "The rank of Output(Out) have the same with Input(X). " + "After expanding, size of each dimension of Output(Out) is equal " + "to size of the corresponding dimension of Input(X) multiplying " + "the corresponding value given by Attr(expand_times)."); + AddAttr>("expand_times", + "Expand times number for each dimension."); + AddComment(R"DOC( +Expand operator tiles the input by given times number. You should set times +number for each dimension by providing attribute 'expand_times'. The rank of X +should be in [1, 6]. 
Please note that size of 'expand_times' must be the same +with X's rank. Following is a using case: + +Input(X) is a 3-D tensor with shape [2, 3, 1]: + + [ + [[1], [2], [3]], + [[4], [5], [6]] + ] + +Attr(expand_times): [1, 2, 2] + +Output(Out) is a 3-D tensor with shape [2, 6, 2]: + + [ + [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], + [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] + ] + +)DOC"); + } +}; + +class ExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], + "Each dimension size of Input(Out@GRAD) should be " + "equal to multiplication of crroresponding dimension " + "size of Input(X) and Attr(expand_times) value."); + } + + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, + ops::ExpandGradOp); +REGISTER_OP_CPU_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a9f39708beec3e0d32d65245c63f8ccf9df8604 --- /dev/null +++ b/paddle/fluid/operators/expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8df1cd34d7dc5093b9bdcd3d015be4f9958d089d --- /dev/null +++ b/paddle/fluid/operators/expand_op.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +#define EXPAND_TEMPLATE(z, n, data) \ + case n + 1: { \ + Expand(context); \ + break; \ + } +#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) +#define COND(n) \ + BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \ + BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) +#define EXPAND_GRAD_CASE(n) \ + case n: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ + } +#define EXPAND_GRAD_TEMPLATE(z, n, data) \ + BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) +#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenTensor = framework::EigenTensor; + +template +class ExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + switch (rank) { + REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_ENFORCE(false, + "Only support tensor with rank being between 1 and 6."); + } + } + + protected: + template + void Expand(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto* out0 = context.Output("Out"); + Eigen::DSizes bcast_dims; + auto x_dims = in0->dims(); + for (size_t i = 0; i < expand_times.size(); ++i) { + bcast_dims[i] = expand_times[i]; + } + auto x = EigenTensor::From(*in0); + out0->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*out0); + auto& place = + *context.template device_context().eigen_device(); + y.device(place) = x.broadcast(bcast_dims); + } +}; + +template +class ExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto x_dims = in0->dims(); + // 1. reshape_dims_vec is the broadcast parameter. For each dimension i, + // if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two + // dimensions [expand_times[i], x_dims[i]]. + // 2. reduce_dims_vec is the dimension parameter to compute gradients. For + // each dimension expanded, the gradients should be summed to original + // size. 
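+    // Worked example (values taken from the Expand doc comment above,
+    // rank-3 case): x_dims = [2, 3, 1], expand_times = [1, 2, 2]
+    //   i = 0: times == 1           -> reshape_dims_vec = [2]
+    //   i = 1: times == 2, dim == 3 -> reduce dim 1, reshape_dims_vec = [2, 2, 3]
+    //   i = 2: times == 2, dim == 1 -> reduce dim 3, reshape_dims_vec = [2, 2, 3, 2]
+    // so dOut of shape [2, 6, 2] is viewed as [2, 2, 3, 2], summed over
+    // dims {1, 3}, and reshaped back to [2, 3, 1] to produce dX.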
+ std::vector reshape_dims_vec; + std::vector reduce_dims_vec; + for (size_t i = 0; i < expand_times.size(); ++i) { + if (expand_times[i] == 1) { + reshape_dims_vec.push_back(x_dims[i]); + } else { + if (x_dims[i] == 1) { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + } else { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + reshape_dims_vec.push_back(x_dims[i]); + } + } + } + + int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED + + reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1; + // no need reduce, just copy + if (reduce_dims_vec.size() == 0) { + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + out0->mutable_data(context.GetPlace()); + framework::Copy(*in0, context.GetPlace(), context.device_context(), out0); + } else { + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(72) + default: + PADDLE_ENFORCE( + false, "Only support tensor with rank being between 1 and 6."); + } + } + } + + protected: + template + void ExpandBackward(const framework::ExecutionContext& context, + const std::vector& reshape_dims_vec, + const std::vector& reduce_dims_vec) const { + size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1; + size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1; + PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), + "Inconsistent size between template Dims and " + "reshape dimensions."); + PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), + "Inconsistent size between template Dims and " + "reduce dimensions."); + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto x = EigenVector::Flatten(*(context.Input("X"))); + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + Eigen::DSizes reshape_dims; + for (size_t i = 0; i < reshape_size; ++i) { + reshape_dims[i] = reshape_dims_vec[i]; + } + Eigen::DSizes reduce_dims; + for (size_t i = 0; i < reduce_size; ++i) { + reduce_dims[i] = reduce_dims_vec[i]; + } + auto out_grad = EigenVector::Flatten(*in0); + x_grad.device( + *context.template device_context().eigen_device()) = + out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b3f5f0d1d09a932e15936285f5cb226daa86e95 --- /dev/null +++ b/paddle/fluid/operators/feed_op.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class FeedOp : public framework::OperatorBase { + public: + FeedOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto feed_var_name = Input("X"); + auto *feed_var = scope.FindVar(feed_var_name); + + PADDLE_ENFORCE(feed_var != nullptr, + "Cannot find feed_var in scope, feed_var_name is %s", + feed_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = Attr("col"); + + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " + << out_name; + + auto &feed_list = feed_var->Get(); + auto &feed_item = feed_list.at(static_cast(col)); + auto *out_item = out_var->GetMutable(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (platform::is_same_place(feed_item.place(), place)) { + out_item->ShareDataWith(feed_item); + } else { + framework::Copy(feed_item, place, dev_ctx, out_item); + } + out_item->set_lod(feed_item.lod()); + } +}; + +class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of feed op"); + AddOutput("Out", "The output of feed op"); + AddAttr("col", "(int) The column of feed"); + AddComment(R"DOC( +Feed Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(feed, paddle::operators::FeedOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FeedOpInfoMaker); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..54e5892016cdb01f50189147a7453b868c5a48c0 --- /dev/null +++ b/paddle/fluid/operators/fetch_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class FetchOp : public framework::OperatorBase { + public: + FetchOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto fetch_var_name = Input("X"); + auto *fetch_var = scope.FindVar(fetch_var_name); + PADDLE_ENFORCE(fetch_var != nullptr, + "Cannot find fetch variable in scope, fetch_var_name is %s", + fetch_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = static_cast(Attr("col")); + + auto *fetch_list = out_var->GetMutable(); + auto &src_item = fetch_var->Get(); + + if (col >= fetch_list->size()) { + fetch_list->resize(col + 1); + } + auto &dst_item = fetch_list->at(col); + + // FIXME(yuyang18): Should we assume the fetch operator always generate + // CPU outputs? + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(src_item.place()); + + Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); + dev_ctx.Wait(); + dst_item.set_lod(src_item.lod()); + + VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; + } +}; + +class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fetch op"); + AddOutput("Out", "The output of fetch op"); + AddAttr("col", "(int) The column of fetch"); + AddComment(R"DOC( +Fetch Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(fetch, paddle::operators::FetchOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FetchOpInfoMaker); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6992ba371c1dd61f7f6fa293be586818350fb3f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" + +namespace paddle { +namespace operators { + +class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Input"), + "Input(Input) of FillConstantBatchSizeLikeOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FillConstantBatchSizeLikeOp should not be null."); + + auto &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_GT(shape.size(), 0); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto output_dim = framework::make_ddim(shape_int64); + + int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); + PADDLE_ENFORCE_GE(input_dim_idx, 0); + PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + + int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); + PADDLE_ENFORCE_GE(output_dim_idx, 0); + PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + + output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; + ctx->SetOutputDim("Out", output_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class FillConstantBatchSizeLikeOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddInput("Input", + "(Tensor) Tensor " + "whose dim_idx th dimension is used to specify the batch_size"); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("input_dim_idx", + "(int, default 0) The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "(int, default 0) The index of output's batch size dimension") + .SetDefault(0); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOp, + paddle::framework::EmptyGradOpMaker, + ops::FillConstantBatchSizeLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f4d2a50305e2582f23ceed931d655f9690e110 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da4a20d99a13533019d57fca42b1b49780200b79 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto value = ctx.Attr("value"); + + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4bf6406e5716a6b65a234d1cd642b64dcc5726f --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class FillConstantInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillConstantOp should not be null."); + auto &shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } +}; + +class FillConstantOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize(framework::make_ddim(Attr>("shape"))); + if (force_cpu) { + auto cpu = platform::CPUPlace(); + out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + } else { + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, &out, value); + } +}; + +class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("value", "(float, default 0) The value to be filled") + .SetDefault(0.0f); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device") + .SetDefault(false); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddComment(R"DOC( +FillConstantBatchSizeLike Operator. + +Fill up a variable with specified constant value. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, + ops::FillConstantInferShape, ops::FillConstantOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8e318f37cf0bc945597b5aa7b384e53038c97786 --- /dev/null +++ b/paddle/fluid/operators/fill_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(place)) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(place)) { + // Copy tensor to out + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(tensor, place, dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. " + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..958bfb1557d9fa39534caef594818aa97bbe03a6 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" + +namespace paddle { +namespace operators { + +class FillZerosLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FillZerosLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of fill-zeros-like op."); + AddOutput("Out", "The variable will be filled up with zeros."); + AddComment(R"DOC( +FillZerosLike Operator. + +Fill up a variable with zeros. +The output will have the same size as the input. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, + ops::FillZerosLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..07078573d8aaa1d72f876f4be68ff70d8a56d8a1 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_zeros_like_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..141c3809e9aa3e2984bf802418f8ddf7d92fa446 --- /dev/null +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class FillZerosLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e72a173751e9b163b6083df474c2b46c76ed459d --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/ftrl_op.h" + +namespace paddle { +namespace operators { + +class FTRLOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"), + "Input(SquaredAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"), + "Input(LinearAccumulator) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of FTRL should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"), + "Output(SquaredAccumOut) of FTRL should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"), + "Output(LinearAccumOut) of FTRL should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of FTRL Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("SquaredAccumOut", param_dim); + ctx->SetOutputDim("LinearAccumOut", param_dim); + } +}; + +class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("SquaredAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates squared gradients."); + AddInput("LinearAccumulator", + "(Tensor, default Tensor) " + "Accumulator that accumulates linear gradients."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("SquaredAccumOut", + "(Tensor) Output accumulated squared" + " gradients."); + AddOutput("LinearAccumOut", + "(Tensor) Output accumulated linear" + " gradients."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddAttr("lr_power", + "(float, default -0.5f) " + "Learning Rate Power.") + .SetDefault(-0.5f); + AddComment(R"DOC( +FTRL (Follow The Regularized Leader) Operator. 
+ +Optimizer that implements the FTRL algorithm: + +$$ +new\_accum = squared\_accum + grad^2 \\ +if (lr\_power == -0.5) { + linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) / + (learning\_rate * param) \\ +} else { + linear\_accum += grad - + (new\_accum^{-lr\_power} - accum^{-lr\_power}) / + (learning\_rate * param) \\ +} + +x = (l1 * sign(linear\_accum) - linear\_accum) +if (lr\_power == -0.5) { + y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} else { + y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\ + pre\_shrink = \frac{x}{y} \\ + param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\ +} +squared\_accum += grad^2; +$$ + +The paper that proposed Follow The Regularized Leader (FTRL): +(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.cu b/paddle/fluid/operators/ftrl_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..dbdfcb927e0aff373a716c5e0eace96bec38e9ad --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/ftrl_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9405fcef1fa405ab14ba7c797b99c2259892f7 --- /dev/null +++ b/paddle/fluid/operators/ftrl_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class FTRLOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* sq_accum_out = ctx.Output("SquaredAccumOut"); + auto* lin_accum_out = ctx.Output("LinearAccumOut"); + + param_out->mutable_data(ctx.GetPlace()); + sq_accum_out->mutable_data(ctx.GetPlace()); + lin_accum_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + auto lr_power = static_cast(ctx.Attr("lr_power")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto sq_accum = + EigenVector::Flatten(*ctx.Input("SquaredAccumulator")); + auto lin_accum = + EigenVector::Flatten(*ctx.Input("LinearAccumulator")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto s_acc_out = EigenVector::Flatten(*sq_accum_out); + auto l_acc_out = EigenVector::Flatten(*lin_accum_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto new_accum = sq_accum + g * g; + // Special case for lr_power = -0.5 + if (lr_power == static_cast(-0.5)) { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p; + } else { + l_acc_out.device(place) = + lin_accum + g - + ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) / + lr.broadcast(grad_dsize)) * + p; + } + + auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out); + if (lr_power == static_cast(-0.5)) { + auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } else { + auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) + + l_acc_out.constant(static_cast(2) * l2); + auto pre_shrink = x / y; + p_out.device(place) = + (l_acc_out.abs() > l_acc_out.constant(l1)) + .select(pre_shrink, p.constant(static_cast(0))); + } + + s_acc_out.device(place) = sq_accum + g * g; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..af5898e29ecaed5a4d2cf8372a3bb20f192fc776 --- /dev/null +++ b/paddle/fluid/operators/gather.cu.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::DeviceContext; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output, + size_t index_size, size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int gather_i = indices[indices_i]; + int params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + GatherCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h new file mode 100644 index 0000000000000000000000000000000000000000..287732eeb6e5249f631bc3e39cd18bc050f9fc3b --- /dev/null +++ b/paddle/fluid/operators/gather.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dceeb71ee3552bcb462014b5f08a59d4406497ad --- /dev/null +++ b/paddle/fluid/operators/gather_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gather_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class GatherOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of GatherOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GatherOp should not be null."); + + auto index_dims = ctx->GetInputDim("Index"); + PADDLE_ENFORCE(index_dims.size() == 1); + int batch_size = ctx->GetInputDim("Index")[0]; + PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0"); + framework::DDim output_dims(ctx->GetInputDim("X")); + output_dims[0] = batch_size; + ctx->SetOutputDim("Out", output_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class GatherOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The source input of gather op"); + AddInput("Index", "The index input of gather op"); + AddOutput("Out", "The output of gather op"); + AddComment(R"DOC( +Gather Operator. + +$Out = X[Index]$ + +Out is obtained by gathering entries of the outer-most dimension +of X indexed by Index and concatenate them together. + +Example: + +X = [[1, 2], + [3, 4], + [5, 6]] + +Index = [[1, 2]] + +Then: + +Out = [[3, 4], + [5, 6]] + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad, + ops::GatherGradOp); +REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel); +REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..484f4232624e862aff7c0aff337b4e5df65d5be3 --- /dev/null +++ b/paddle/fluid/operators/gather_op.cu @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gather.cu.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class GatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + GPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + auto *x = ctx.Input("X"); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba4a31c81be025978c6c2a325792eea2eb353a7 --- /dev/null +++ b/paddle/fluid/operators/gather_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GatherOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + CPUGather(ctx.device_context(), *x, *index, output); + } +}; + +template +class GatherGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto *Index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + + ScatterAssign(ctx.device_context(), *dO, *Index, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d86cf5ce334705d16435f542cc33be454edabb7 --- /dev/null +++ b/paddle/fluid/operators/gather_test.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(Gather, GatherData) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + int* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + + for (int i = 0; i < 12; ++i) p_src[i] = i; + p_index[0] = 1; + p_index[1] = 0; + + int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + + auto* cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext ctx(*cpu_place); + CPUGather(ctx, *src, *index, output); + + for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); + for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b090f8759765039eadfc900361bcdabe215c2225 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class GaussianRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of GaussianRandomOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + PADDLE_ENFORCE(shape.size() > 0UL, + "shape can be one int or array. 
shape must be set."); + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "Output matrix of gaussian random op"); + + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); + AddAttr("mean", + "(float, default 0.0) " + "mean of random tensor.") + .SetDefault(.0f); + AddAttr("std", + "(float, default 1.0) " + "std of random tensor.") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed of generator." + "0 means use system wide seed.") + .SetDefault(0); + AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::DataType::FP32); + + AddComment(R"DOC( +GaussianRandom Operator. + +Used to initialize tensors with gaussian random generator. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, + ops::GaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..70d655d4bb259bf33765fa42e46a19510ffca35d --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + + __host__ __device__ GaussianGenerator(T mean, T std, int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::normal_distribution dist(mean_, std_); + rng.discard(n); + return dist(rng); + } +}; + +template +class GPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba908e472bbc165a244d8543713f1dbf293abb48 --- /dev/null +++ b/paddle/fluid/operators/get_places_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + is_gpu ? 
"GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on flags. The list will be used for parallel +execution. +)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o_name : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o_name).SetType( + framework::proto::VarDesc::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1436e55b0e13b8b327a61e7c91294fa958b146c4 --- /dev/null +++ b/paddle/fluid/operators/gru_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gru_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(%s) of GRUOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), + "Output(%s) of GRUOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(%s) of GRUOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_dims[1], frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("BatchGate", input_dims); + ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size}); + ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", {input_dims[0], frame_size}); + ctx->ShareLoD("Input", "Hidden"); + } +}; + +class GRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) The first input is a LodTensor, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTenosr is a matrix with shape (T X 3D), where, T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput( + "Weight", + "(Tensor) The learnable hidden-hidden weight matrix with shape " + "(D x 3D), where D is the hidden size. The elements continuous in " + "memory can be divided into two parts. The first part are weights of " + "the update gate and reset gate with shape (D x 2D), and the second " + "part are weights of output candidate with shape (D x D)."); + AddInput("Bias", + "(Tensor, optional) Bias vector with shape (1 x 3D) concating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("BatchGate", + "(LoDTensor) To compute with batches, sequence data will be " + "reorganized into several successive batches each containing " + "data from the same time step. 
The LoDTensor BatchGate contains " + "the update gate, reset gate and output candidate values " + "organized in batches. The LoD size is 2. The first LoD contains " + "the batch offsets and the second LoD contains the indexes in " + "the raw sequence data.") + .AsIntermediate(); + AddOutput( + "BatchResetHiddenPrev", + "(LoDTensor) The reseted hidden state LoDTensor organized in batches. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "BatchHidden", + "(LoDTensor) The hidden state LoDTensor organized in batches. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`.") + .AsIntermediate(); + AddOutput( + "Hidden", + "(LoDTensor) the hidden state LoDTensor organized in sequences. " + "This LoDTensor is a matrix with shape (T X D) and has the same LoD " + "with `BatchGate`."); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddComment(R"DOC( +GRU Operator implements part calculations of the complete GRU as following: + +$$ +update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +@note To implement the complete GRU, fully-connected operator must be used +before to feed xu, xr and xc as the Input of GRU operator. 
+)DOC"); + } +}; + +class GRUGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(%s) of GRUGradOp should not be null.", "BatchGate"); + PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"), + "Input(%s) of GRUGradOp should not be null.", + "BatchResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("BatchHidden"), + "Input(%s) of GRUOp should not be null.", "BatchHidden"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto weight_dims = ctx->GetInputDim("Weight"); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ(input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + auto h0_grad_name = framework::GradVarName("H0"); + if (ctx->HasOutput(h0_grad_name)) + ctx->SetOutputDim(h0_grad_name, h0_dims); + } + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e908d01d2920af8bdbbdc694944e62a86bad327a --- /dev/null +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gru_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..37f3ae1a837c77bd5e3696abbd9ae14257a7f5d7 --- /dev/null +++ b/paddle/fluid/operators/gru_op.h @@ -0,0 +1,261 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class GRUKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("Input"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* bias = context.Input("Bias"); + auto* batch_gate = context.Output("BatchGate"); + batch_gate->mutable_data(context.GetPlace()); + auto* batch_reset_hidden_prev = + context.Output("BatchResetHiddenPrev"); + batch_reset_hidden_prev->mutable_data(context.GetPlace()); + auto* batch_hidden = context.Output("BatchHidden"); + batch_hidden->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + context.ShareLoD("Input", "Hidden"); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = context.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); + to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); + + if (bias) { + math::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate, *bias, batch_gate); + } + + int frame_size = hidden_dims[1]; + math::GRUMetaValue gru_value; + gru_value.gate_weight = 
const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + gru_value.prev_out_value = gru_value.output_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(dev_ctx, *batch_hidden, *hidden); + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +template +class GRUGradKernel : public framework::OpKernel { + public: + void BatchCompute(const framework::ExecutionContext& context) const { + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); + const T* weight_data = weight->data(); + auto* batch_gate = context.Input("BatchGate"); + auto* batch_reset_hidden_prev = + context.Input("BatchResetHiddenPrev"); + auto* batch_hidden = context.Input("BatchHidden"); + auto* hidden = context.Input("Hidden"); + auto* hidden_grad = + context.Input(framework::GradVarName("Hidden")); + auto* input_grad = + context.Output(framework::GradVarName("Input")); + auto* h0_grad = context.Output(framework::GradVarName("H0")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + + auto gate_dims = batch_gate->dims(); + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + math::LoDTensor2BatchFunctor to_batch; + LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; + batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); + batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); + batch_reset_hidden_prev_grad.mutable_data(hidden_dims, + context.GetPlace()); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); + zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); + zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); + zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); + + Tensor ordered_h0, ordered_h0_grad; + + framework::Vector order(batch_gate->lod()[2]); + + if (h0) { + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); + } + if (h0_grad) { + ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); + zero(context.template device_context(), 
&ordered_h0_grad, + static_cast(0.0)); + } + + bool is_reverse = context.Attr("is_reverse"); + batch_hidden_grad.set_lod(batch_hidden->lod()); + to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); + + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + + math::GRUMetaGrad gru_grad; + if (weight_grad) { + gru_grad.gate_weight_grad = + weight_grad->mutable_data(context.GetPlace()); + zero(dev_ctx, weight_grad, static_cast(0.0)); + gru_grad.state_weight_grad = + weight_grad->data() + 2 * frame_size * frame_size; + } else { + gru_grad.gate_weight_grad = nullptr; + gru_grad.state_weight_grad = nullptr; + } + + auto batch_starts = batch_hidden_grad.lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + gru_value.gate_value = gate_t.data(); + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); + gru_grad.output_grad = hidden_grad_t.data(); + Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); + gru_grad.gate_grad = gate_grad_t.data(); + Tensor reset_hidden_prev_grad_t = + batch_reset_hidden_prev_grad.Slice(bstart, bend); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); + if (n == 0) { + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = + h0 && h0_grad ? ordered_h0_grad.data() : nullptr; + } else { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); + gru_value.prev_out_value = hidden_prev_t.data(); + Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); + } + + math::GRUUnitGradFunctor::compute( + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, + active_gate); + } + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + math::Batch2LoDTensorFunctor to_seq; + batch_gate_grad.set_lod(batch_gate->lod()); + to_seq(dev_ctx, batch_gate_grad, *input_grad); + } + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + math::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_grad, bias_grad); + } + if (h0 && h0_grad) { + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + BatchCompute(context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..21ad3aeb492ee18d465edea6ec0fca7e49d1366b --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class GRUUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasOutput("Gate"), + "Output(%s) of GRUUnitOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"), + "Output(%s) of GRUUnitOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(%s) of GRUUnitOp should not be null.", "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + ctx->SetOutputDim("Gate", {batch_size, frame_size * 3}); + ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size}); + ctx->SetOutputDim("Hidden", {batch_size, frame_size}); + } +}; + +class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "input."); + AddInput("HiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "states of previous time step."); + AddInput( + "Weight", + "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " + "The elements continuous in memory can be divided into two parts. 
" + "The first part are weights of the update gate and reset gate " + "with shape [frame_size, frame_size * 2], and the second part are " + "weights of output candidate with shape [frame_size, frame_size]."); + AddInput( + "Bias", + "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating " + "bias of the update gate, reset gate and output candidate.") + .AsDispensable(); + AddOutput("Gate", + "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " + "output of update gate, reset gate and output candidate.") + .AsIntermediate(); + AddOutput("ResetHiddenPrev", + "(Tensor) Matrix with shape [batch_size, frame_size] for the " + "reseted hidden state of previous time step.") + .AsIntermediate(); + AddOutput("Hidden", + "(Tensor) The GRU hidden state of the current time step " + "with shape [batch_size, frame_size]."); + AddAttr("activation", + "(enum int, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault(tanh) + .InEnum({identity, sigmoid, tanh, relu}); + AddAttr("gate_activation", + "(enum int, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault(sigmoid) + .InEnum({identity, sigmoid, tanh, relu}); + AddComment(R"DOC( +GRUUnit Operator implements partial calculations of the GRU unit as following: + +$$ +update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) +$$ + +which is same as one time step of GRU Operator. + +@note To implement the complete GRU unit, fully-connected operator must be +used before to feed xu, xr and xc as the Input of GRUUnit operator. + +)DOC"); + } +}; + +class GRUUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(%s) of GRUUnitGradOp should not be null.", "Input"); + PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "HiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(%s) of GRUUnitGradOp should not be null.", "Weight"); + PADDLE_ENFORCE(ctx->HasInput("Gate"), + "Input(%s) of GRUUnitGradOp should not be null.", "Gate"); + PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"), + "Input(%s) of GRUUnitGradOp should not be null.", + "ResetHiddenPrev"); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(%s) of GRUUnitGradOp should not be null.", "Hidden"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), + "Input(%s@GRAD) of GRUUnitGradOp should not be null.", + "Hidden"); + auto input_dims = ctx->GetInputDim("Input"); + auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev"); + auto weight_dims = ctx->GetInputDim("Weight"); + // int batch_size = input_dims[0]; + int input_size = input_dims[1]; + int frame_size = hidden_prev_dims[1]; + int weight_height = weight_dims[0]; + int weight_width = weight_dims[1]; + PADDLE_ENFORCE_EQ( + input_size, frame_size * 3, + "The input_size must be 3 times of frame_size in GRUUnitOp."); + PADDLE_ENFORCE_EQ( + weight_height, frame_size, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + PADDLE_ENFORCE_EQ( + weight_width, frame_size * 3, + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (ctx->HasInput("Bias")) { + auto bias_dims = 
ctx->GetInputDim("Bias"); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_ENFORCE_EQ(bias_height, 1, + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_ENFORCE_EQ(bias_width, frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + auto input_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(input_grad_name)) + ctx->SetOutputDim(input_grad_name, input_dims); + auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev"); + if (ctx->HasOutput(hidden_prev_grad_name)) + ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims); + auto weight_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(weight_grad_name)) + ctx->SetOutputDim(weight_grad_name, weight_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, + ops::GRUUnitGradOp); +REGISTER_OP_CPU_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..88b707fd1314ec5f12b507edc64a56ea9895a9d6 --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/gru_unit_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c4031a5a575e59488c1a6cd77c3da88ea6af423e --- /dev/null +++ b/paddle/fluid/operators/gru_unit_op.h @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
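The [frame_size, frame_size * 3] Weight documented above is stored as two back-to-back blocks, which is why the GRU kernels can reach the candidate weights with a single pointer offset: the first 2 * D * D elements hold the update/reset gate weights and the next D * D elements hold the output-candidate weights. A small sketch of that split, with illustrative names:

struct GRUWeightView {
  const float* gate_weight;   // the D x 2D update/reset block
  const float* state_weight;  // the D x D output-candidate block
};

// Mirrors gru_value.gate_weight / gru_value.state_weight in the kernels above.
GRUWeightView SplitGRUWeight(const float* weight_data, int frame_size) {
  return {weight_data, weight_data + 2 * frame_size * frame_size};
}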
*/ + +#pragma once + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + +enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; + +template +class GRUUnitKernel : public framework::OpKernel { + public: + template + void ActCompute(const int act_type, const Device& d, X x, Y y) const { + if (act_type == identity) + y.device(d) = x; + else if (act_type == sigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == tanh) + TanhFunctor()(d, x, y); + else if (act_type == relu) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* bias = context.Input("Bias"); + auto* gate = context.Output("Gate"); + gate->mutable_data(context.GetPlace()); + auto* reset_hidden_prev = context.Output("ResetHiddenPrev"); + reset_hidden_prev->mutable_data(context.GetPlace()); + auto* hidden = context.Output("Hidden"); + hidden->mutable_data(context.GetPlace()); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + auto x = EigenMatrix::From(*input); + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto r_h_p = EigenMatrix::From(*reset_hidden_prev); + auto h = EigenMatrix::From(*hidden); + auto& place = + *context.template device_context().eigen_device(); + + // calculate unactivated gate outputs + if (bias) { + auto b = EigenMatrix::From(*bias); + g.device(place) = x + + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); + } else { + g.device(place) = x; + } + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_data = gate->data(); + T* reset_hidden_prev_data = reset_hidden_prev->data(); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); + + // calculate activited gate + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(u_offsets, extents), g.slice(u_offsets, extents)); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + ActCompute(context.Attr("gate_activation"), place, + g.slice(r_offsets, extents), g.slice(r_offsets, extents)); + auto r = g.slice(r_offsets, extents); // reset gate + r_h_p.device(place) = r * h_p; // reset previous hidden state + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); + + Eigen::array c_offsets({{0, frame_size * 2}}); + ActCompute(context.Attr("activation"), place, + g.slice(c_offsets, extents), g.slice(c_offsets, extents)); + auto c = g.slice(c_offsets, extents); // output candidate + + // calculate final output + h.device(place) = u * (c 
- h_p) + h_p; + } +}; + +template +class GRUUnitGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx, + DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == identity) + dx.device(d) = dy; + else if (act_type == sigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == tanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == relu) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* gate = context.Input("Gate"); + auto* reset_hidden_prev = context.Input("ResetHiddenPrev"); + auto* hidden_grad = context.Input(framework::GradVarName("Hidden")); + auto* input_grad = context.Output(framework::GradVarName("Input")); + auto* hidden_prev_grad = + context.Output(framework::GradVarName("HiddenPrev")); + auto* weight_grad = + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = context.Output(framework::GradVarName("Bias")); + Tensor gate_grad; + Tensor reset_hidden_prev_grad; + + const T* hidden_prev_data = hidden_prev->data(); + const T* weight_data = weight->data(); + T* gate_grad_data = + gate_grad.mutable_data(input->dims(), context.GetPlace()); + const T* reset_hidden_prev_data = reset_hidden_prev->data(); + T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data( + reset_hidden_prev->dims(), context.GetPlace()); + + auto h_p = EigenMatrix::From(*hidden_prev); + auto g = EigenMatrix::From(*gate); + auto d_h = EigenMatrix::From(*hidden_grad); + auto d_g = EigenMatrix::From(gate_grad); + auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); + auto& place = + *context.template device_context().eigen_device(); + + int batch_size = input->dims()[0]; + int frame_size = hidden_prev->dims()[1]; + + Eigen::array extents({{batch_size, frame_size}}); + Eigen::array u_offsets({{0, 0}}); + auto u = g.slice(u_offsets, extents); // update gate + Eigen::array r_offsets({{0, frame_size}}); + auto r = g.slice(r_offsets, extents); // reset gate + Eigen::array c_offsets({{0, frame_size * 2}}); + auto c = g.slice(c_offsets, extents); // output candidate + + // backward for unactivated update gate + ActGradCompute(context.Attr("gate_activation"), place, u, u, + d_g.slice(u_offsets, extents), d_h * (c - h_p)); + // backward for unactivated output candidate + ActGradCompute(context.Attr("activation"), place, c, c, + d_g.slice(c_offsets, extents), d_h * u); + // backward for reset_hidden_prev + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); + // backward for unactivated reset gate + ActGradCompute(context.Attr("gate_activation"), place, r, r, + d_g.slice(r_offsets, extents), d_r_h_p * h_p); + // backward for weight + if (weight_grad) { + T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); + // backward for state_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 
2, frame_size); + + // backward for update_gate_weight and reset_gate_weight + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); + } + // backward for hidden_prev + if (hidden_prev_grad) { + T* hidden_prev_grad_data = + hidden_prev_grad->mutable_data(context.GetPlace()); + auto d_h_p = EigenMatrix::From(*hidden_prev_grad); + d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); + } + // backward for input + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + auto d_x = EigenMatrix::From(*input_grad); + d_x.device(place) = d_g; + } + // backward for bias + if (bias_grad) { + bias_grad->mutable_data(context.GetPlace()); + auto d_b = EigenVector::Flatten(*bias_grad); + d_b.device(place) = d_g.sum(Eigen::array({{0}})); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f644c22c9f1bdde6edc0126186361baccfbfcfb0 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace paddle { +namespace operators { + +class HingeLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Logits) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Logits) contains a real value, " + "so the 2nd dimension of Input(Logits) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Logits", "Loss"); + } +}; + +template +class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "The input value (Logits) of Hinge loss op." + "Logits is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Hinge loss op." 
+ "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the hinge loss."); + AddComment(R"DOC( +HingeLoss Operator. + +Let x be a logit (prediction) and y be the actual label. The logit can +take any values from (-inf, inf), but the labels should be either -1 or 1. +Then, the hinge loss is computed as follows: + +$$ +L_(x, y) = max(1 - y.x, 0) +$$ + +Note that the labels passed as input will have values as either 0 or 1. + +)DOC"); + } +}; + +class HingeLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Input(Logits@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto lab_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Logits"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb53a9b7f4aaeeee71ed81c507feb0e9c946a541 --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/hinge_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1e924d236ea1d3208a8f425f76be8d455714a51f --- /dev/null +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class HingeLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + loss->mutable_data(context.GetPlace()); + auto l = framework::EigenVector::Flatten(*loss); + l.device(place) = + (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +class HingeLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* dloss = + context.Input(framework::GradVarName("Loss")); + auto* dpred = + context.Output(framework::GradVarName("Logits")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + auto dl = framework::EigenVector::Flatten(*dloss); + + if (dpred) { + dpred->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + auto alt_labels = static_cast(2) * y - static_cast(1); + dx.device(place) = + dl * ((x * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc1f609dcfa23dff82812e72c16b1d62a93ca9a6 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
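The gradient kernel above follows directly from differentiating that remapped hinge loss. Writing a = 2y - 1 for the ±1 label, the per-element derivative is

$$
\frac{\partial}{\partial x} max(1 - a \cdot x, 0) =
\begin{cases}
-a, \quad a \cdot x < 1 \\
0, \quad otherwise
\end{cases}
$$

which the kernel then scales by the incoming Loss gradient, matching the dl * ((x * alt_labels) < 1) * (-alt_labels) expression used above.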
*/ + +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace paddle { +namespace operators { + +class HuberLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of Input(X) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, + "Each row of Input(X) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Residual", x_dims); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", "Out"); + } +}; + +template +class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input value of huber loss op." + "X is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Y", + "The target value of huber loss op." + "Y is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Residual", + "Intermediate tensor to cache residual value between Y and X." + "The shape is same as Input(X) and will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", + "The output tensor with shape [batch_size, 1] " + "which represents the huber loss."); + AddAttr("delta", "Hyper parameter in huber loss."); + AddComment(R"DOC( +HuberLoss Operator. + +Huber loss is a loss function used in robust regression. We define X as the +input value and Y as the target value. Huber loss can evaluate the fitness of +X to Y. Different from MSE loss, Huber loss is more robust for outliers. The +shape of X and Y are [batch_size, 1]. The equation is: + +$$ +Out_{\delta}(X, Y)_i = +\begin{cases} +0.5 * (Y_i - X_i)^2, +\quad |Y_i - X_i| \leq \delta \\ +\delta * (|Y_i - X_i| - 0.5 * \delta), +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
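+
+For example (the numbers are purely illustrative): with $\delta = 1.0$, a
+residual of $0.5$ falls in the quadratic branch and gives $0.5 * 0.5^2 = 0.125$,
+while a residual of $3.0$ falls in the linear branch and gives
+$1.0 * (3.0 - 0.5) = 2.5$, so large errors are penalized only linearly.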
+ +)DOC"); + } +}; + +class HuberLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Residual"), + "Input(Residual) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto residual_dims = ctx->GetInputDim("Residual"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(residual_dims, x_dims); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, + huber_loss_grad, ops::HuberLossGradOp); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ef5120c69d4fda533625ace9bab504be39385ec9 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( + huber_loss_grad, + ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..caca89fcf63d27c7717de522e38f6ff0cab0d8f6 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +struct HuberLossForward { + HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return static_cast(0.5) * val * val; + } else { + return delta * (abs_val - static_cast(0.5) * delta); + } + } + + T delta; +}; + +template +class HuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("Residual"); + auto* out1 = context.Output("Out"); + auto delta = static_cast(context.Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + out0->mutable_data(context.GetPlace()); + auto residual = EigenVector::Flatten(*out0); + residual.device(place) = y - x; + out1->mutable_data(context.GetPlace()); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); + } +}; + +template +struct HuberLossBackward { + HOSTDEVICE HuberLossBackward(const T& delta, T sign) + : sign(sign), delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return sign * val; + } else { + if (val > 0) { + return sign * delta; + } else { + return -1 * sign * delta; + } + } + } + + T sign; + T delta; +}; + +template +class HuberLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Residual"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + auto delta = static_cast(context.op().Attr("delta")); + auto& place = + *context.template device_context().eigen_device(); + + auto residual = EigenVector::Flatten(*in0); + auto out_grad = EigenVector::Flatten(*in1); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + x_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenVector::Flatten(*out1); + y_grad.device(place) = + out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..936e5fe49eda40dff6d8aa5fd626d443ee8dbe75 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/im2sequence_op.h"
+
+namespace paddle {
+namespace operators {
+
+class Im2SequenceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Im2SequenceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of Im2SequenceOp should not be null.");
+
+    auto in_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
+                      "Input(X) format must be 4D tensor, e.g., NCHW.");
+
+    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    int batch_size = in_dim[0];
+    int img_channels = in_dim[1];
+    int img_height = in_dim[2];
+    int img_width = in_dim[3];
+
+    int output_height = OutputSize(img_height, kernels[0], paddings[0],
+                                   paddings[2], strides[0]);
+    int output_width =
+        OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]);
+
+    ctx->SetOutputDim("Out", {batch_size * output_height * output_width,
+                              img_channels * kernels[0] * kernels[1]});
+  }
+};
+
+class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input tensor has NCHW format."
+             "N: batch size"
+             "C: channels"
+             "H: height"
+             "W: width");
+    AddOutput("Out", "(LoDTensor) The output data of im2sequence op.");
+    AddAttr<std::vector<int>>("kernels",
+                              "(vector<int>), the "
+                              "kernels(kernel_height, kernel_width)");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int> default:{1, 1}), the "
+                              "strides(h_stride, w_stride)")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector<int> default:{0, 0, 0, 0}), the "
+                              "paddings(up_pad, left_pad, down_pad, right_pad)")
+        .SetDefault({0, 0, 0, 0});
+    AddComment(R"DOC(
+This op uses kernels to scan images and converts these images to sequences.
+After expanding, the number of time steps is output_height * output_width
+and the dimension of each time step is kernel_height * kernel_width * channels,
+in which:
+
+output_height =
+    1 + (padding_up + padding_down + img_height - kernel_height + stride_height - 1) /
+        stride_height;
+output_width =
+    1 + (padding_left + padding_right + img_width - kernel_width + stride_width - 1) /
+        stride_width;
+
+This op can be used after a convolutional neural network, and before a recurrent neural network.
+
+Given:
+
+x = [[[[ 6. 2. 1.]
+       [ 8. 3. 5.]
+       [ 0. 2. 6.]]
+
+      [[ 2. 4. 4.]
+       [ 6. 3. 0.]
+       [ 6. 4. 7.]]]
+
+     [[[ 6. 7. 1.]
+       [ 5. 7. 9.]
+       [ 2. 4. 8.]]
+
+      [[ 1. 2. 1.]
+       [ 1. 3. 5.]
+       [ 9. 0. 8.]]]]
+x.dims = {2, 2, 3, 3}
+
+And:
+
+kernels = [2, 2]
+strides = [1, 1]
+paddings = [0, 0, 0, 0]
+
+Then:
+
+output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.]
+               [ 2. 1. 3. 5. 4. 4. 3. 0.]
+               [ 8. 3. 0. 2. 6. 3. 6. 4.]
+               [ 3. 5. 2. 6. 3. 0. 4. 7.]
+               [ 6. 7. 5. 7. 1. 2. 1. 3.]
+               [ 7. 1. 7. 9. 2. 1. 3. 5.]
+               [ 5. 7. 2. 4. 1. 3. 9. 0.]
+               [ 7. 9. 4. 8. 3. 5. 0.
8.]] +output.dims = {8, 9} +output.lod = [[0, 4, 8]] + +)DOC"); + } +}; + +class Im2SequenceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, + im2sequence_grad, ops::Im2SequenceGradOp); +REGISTER_OP_CPU_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CPU_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e7bf4631224620ad5c65f750ed0c0c22e936dcf --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/im2sequence_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h new file mode 100644 index 0000000000000000000000000000000000000000..59456f0ea2996bb20bd48806a7258e31518a5ea3 --- /dev/null +++ b/paddle/fluid/operators/im2sequence_op.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +inline int OutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} + +template +class Im2SequenceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* in = ctx.Input("X"); + LoDTensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + // TODO(wanghaoshuang): Add layout checker after 'set_layout' + // being available for python API + // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, + // "Input(X) layout must be NCHW"); + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); + + // set lod information + // TODO(wanghaoshuang): Move this to InferShape + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + lod[0].push_back(offset); + offset += output_height * output_width; + } + out->set_lod(lod); + } +}; + +template +class Im2SequenceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + Tensor* d_out = + const_cast(ctx.Input(framework::GradVarName("Out"))); + auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + + auto x_v = framework::EigenVector::Flatten(*d_x); + auto& place = *ctx.template device_context().eigen_device(); + x_v.device(place) = x_v.constant(0.0); + + auto in_dim = in->dims(); + int batch_size = in_dim[0]; + int img_channels = in_dim[1]; + int img_height = in_dim[2]; + int img_width = in_dim[3]; + + auto kernels = ctx.Attr>("kernels"); + auto strides = ctx.Attr>("strides"); + auto paddings = ctx.Attr>("paddings"); + int output_height = OutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = + OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); + + const std::vector dilations({1, 1}); + + auto d_out_dims = d_out->dims(); + d_out->Resize({batch_size, d_out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + Tensor dst = + 
d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + const Tensor src = d_out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + math::Col2ImFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + d_out->Resize(d_out_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/images/batch_norm_fork.dot b/paddle/fluid/operators/images/batch_norm_fork.dot similarity index 100% rename from paddle/operators/images/batch_norm_fork.dot rename to paddle/fluid/operators/images/batch_norm_fork.dot diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/fluid/operators/images/batch_norm_fork.png similarity index 100% rename from paddle/operators/images/batch_norm_fork.png rename to paddle/fluid/operators/images/batch_norm_fork.png diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/fluid/operators/images/batch_norm_op_kernel.png similarity index 100% rename from paddle/operators/images/batch_norm_op_kernel.png rename to paddle/fluid/operators/images/batch_norm_op_kernel.png diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d488067b254c37515c6bdb9a4589aad311f344f --- /dev/null +++ b/paddle/fluid/operators/increment_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class IncrementInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IncrementOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IncrementOp should not be null."); + PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X"))); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +struct IncrementFunctor { + IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out, + float value) + : x_(x), out_(out), value_(value) {} + + template + void operator()() const { + *out_->data() = *x_.data() + static_cast(value_); + } + + const framework::LoDTensor &x_; + framework::LoDTensor *out_; + float value_; +}; + +class IncrementOp : public framework::OperatorBase { + public: + IncrementOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(x.place())); + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + float value = Attr("step"); + VLOG(10) << Output("Out") << " increase " << Input("X") << " with " + << value; + framework::VisitDataType(framework::ToDataType(out.type()), + IncrementFunctor(x, &out, value)); + } +}; + +class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input tensor of increment operator"); + AddOutput("Out", "(Tensor) The output tensor of increment operator."); + AddAttr("step", + "(float, default 1.0) " + "The step size by which the " + "input tensor will be incremented.") + .SetDefault(1.0); + AddComment(R"DOC( +Increment Operator. + +The equation is: +$$Out = X + step$$ + +)DOC"); + } +}; + +class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("increment"); + grad_op->SetInput("X", Output("Out")); + grad_op->SetOutput("Out", Input("X")); + grad_op->SetAttr("step", -boost::get(GetAttr("step"))); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape, + ops::IncrementOpMaker, ops::IncrementGradOpMaker); diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc new file mode 100755 index 0000000000000000000000000000000000000000..c2e452cdfaa71cae53c2bfe259bae7f80cd259d7 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/iou_similarity_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IOUSimilarityOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IOUSimilarityOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of IOUSimilarityOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
+    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
+
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
+  }
+};
+
+class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Box list X is a 2-D LoDTensor with shape [N, 4] that holds N "
+             "boxes, each box is represented as [xmin, ymin, xmax, ymax]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is an image feature map; in that case it is close to "
+             "the origin of the coordinate system. "
+             "[xmax, ymax] is the right bottom coordinate of the box. "
+             "This tensor can contain LoD information to represent a batch "
+             "of inputs. One instance of this batch can contain different "
+             "numbers of entities.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) "
+             "Box list Y holds M boxes, each box is represented as "
+             "[xmin, ymin, xmax, ymax], the shape of Y is [M, 4]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is an image feature map, and [xmax, ymax] is the right "
+             "bottom coordinate of the box.");
+
+    AddOutput("Out",
+              "(LoDTensor, the lod is the same as input X) The output of "
+              "iou_similarity op, a tensor with shape [N, M] "
+              "representing pairwise IOU scores.");
+
+    AddComment(R"DOC(
+IOU Similarity Operator.
+Computes intersection-over-union (IOU) between two box lists.
+ Box list 'X' should be a LoDTensor and 'Y' is a common Tensor;
+ boxes in 'Y' are shared by all instances of the batched inputs of X.
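+ For instance (illustrative numbers only), two boxes of unit area whose
+ intersection has area 0.5 have an IOU of 0.5 / (1 + 1 - 0.5) = 1/3; the
+ exact formula is given below.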
+ Given two boxes A and B, the calculation of IOU is as follows: + +$$ +IOU(A, B) = +\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker); + +REGISTER_OP_CPU_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..f8df1f4aa4c4894b59fe373a9e0cb697dfb96b62 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/iou_similarity_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/iou_similarity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2fb1b5f70703f9c88a532644d88a8b5df45404f0 --- /dev/null +++ b/paddle/fluid/operators/iou_similarity_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +template +inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2, + T ymin2, T xmax2, T ymax2) { + constexpr T zero = static_cast(0); + T area1 = (ymax1 - ymin1) * (xmax1 - xmin1); + T area2 = (ymax2 - ymin2) * (xmax2 - xmin2); + T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; + T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; + T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; + T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; + T inter_height = inter_ymax - inter_ymin; + T inter_width = inter_xmax - inter_xmin; + inter_height = inter_height > zero ? inter_height : zero; + inter_width = inter_width > zero ? 
inter_width : zero; + T inter_area = inter_width * inter_height; + T union_area = area1 + area2 - inter_area; + T sim_score = inter_area / union_area; + return sim_score; +} + +template +struct IOUSimilarityFunctor { + IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols) + : x_(x), y_(y), z_(z), cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + T x_min1 = x_[row_id * 4]; + T y_min1 = x_[row_id * 4 + 1]; + T x_max1 = x_[row_id * 4 + 2]; + T y_max1 = x_[row_id * 4 + 3]; + for (size_t i = 0; i < cols_; ++i) { + T x_min2 = y_[i * 4]; + T y_min2 = y_[i * 4 + 1]; + T x_max2 = y_[i * 4 + 2]; + T y_max2 = y_[i * 4 + 3]; + + T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, + x_max2, y_max2); + + z_[row_id * cols_ + i] = sim; + } + } + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +namespace paddle { +namespace operators { + +template +class IOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + IOUSimilarityFunctor functor(in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), y_n); + + platform::ForRange for_range( + static_cast(ctx.device_context()), x_n); + for_range(functor); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea424018d66dac85d5a4ad75cbf5199064d52848 --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +constexpr char kInput[] = "X"; +constexpr char kOutput[] = "Out"; + +class IsEmptyOp : public framework::OperatorBase { + public: + IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get input + auto *var = scope.FindVar(Input(kInput)); + PADDLE_ENFORCE_NOT_NULL(var); + auto &tensor = var->Get(); + // get output + auto *out = scope.FindVar(Output(kOutput)); + PADDLE_ENFORCE_NOT_NULL(out); + auto *out_tensor = out->GetMutable(); + + out_tensor->Resize({1}); + out_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(tensor.dims()) == 0; + } +}; + +class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInput, "(Tensor) Tensor which is to be checked."); + AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddComment(R"DOC( +IsEmpty Operator which checks whether a tensor is empty. + +It will just return product(tensor.ddims()) > 0; + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, + paddle::operators::IsEmptyOpProtoMaker); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..974ee404f8364ed66e9e213f857ea89993e1d6af --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class L1NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class L1NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class L1NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of l1_norm op."); + AddOutput("Out", "(Scalar) The output of l1_norm op."); + AddComment(R"DOC( +L1 Norm Operator. + +Computes the L1 norm of a tensor. + +$$Out = \sum{|X|}$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, + ops::L1NormGradOp); +REGISTER_OP_CPU_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e9e864a346298a670db63c664491c336a9bd36a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/l1_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7ddf2ac6a9046d4d8c2130b459f2385ef4e1301a --- /dev/null +++ b/paddle/fluid/operators/l1_norm_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(abs(X)) +template +class L1NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto &place = + *context.template device_context().eigen_device(); + + out.device(place) = x.abs().sum(); + } +}; + +// dX = dout * sign(X) +template +class L1NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *x = context.Input("X"); + const framework::Tensor *d_out = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar"); + framework::Tensor *dx = + context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + + auto x_eigen = framework::EigenVector::Flatten(*x); + auto d_out_eigen = framework::EigenVector::Flatten(*d_out); + auto dx_eigen = framework::EigenVector::Flatten(*dx); + auto &place = + *context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(x->numel()); + dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c018965beefb362ae845d132e34ded1bb2911629 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace paddle { +namespace operators { + +class LabelSmoothOp : public framework::OperatorWithKernel { + public: + LabelSmoothOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LabelSmoothOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LabelSmoothOp should not be null."); + auto in_dims = ctx->GetInputDim("X"); + if (ctx->HasInput("PriorDist")) { + auto noise_dims = ctx->GetInputDim("PriorDist"); + auto noise_numel = paddle::framework::product(noise_dims); + PADDLE_ENFORCE( + in_dims[1] == noise_numel, + "The number of elements in Input(PriorDist) must be equal to the " + "dimension of each label."); + } + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", in_dims); + } +}; + +class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) The input labels of LabelSmooth operator. This " + "input can be batched labels in one-hot encoding or output from " + "softmax, with shape [N x K], where N is the batch size and K is " + "the number of classes"); + AddInput("PriorDist", + "(Tensor, optional)" + "The prior distribution to be added to the smoothed label. It is " + "fixed during training and the number of elements should be equal " + "to the dimension K of each label. Default is uniform " + "distribution and each element will be set to 1/K if not provided " + "in input.") + .AsDispensable(); + AddOutput("Out", + "(loDTensor) The smoothed label of LabelSmooth operator. It has" + "the same shape and LoD with the Input(LoDTensor)."); + AddAttr("epsilon", + "(float, default 0.0f)" + "The smoothing parameter of LabelSmooth operator.") + .SetDefault(0.0f); + AddComment(R"DOC( +LabelSmooth Operator. + +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns +to assign full probability to the ground-truth label for each training example, +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted +sum of itself and some fixed distribution $\mu$, i.e. + +$$ + \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, +$$ + +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. This change in the ground-truth label is called label-smoothing +regularization or LSR. + +See more details about label smoothing in https://arxiv.org/abs/1512.00567. 
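+
+As a purely illustrative example: with $K = 4$ classes, the one-hot label
+$y = [0, 1, 0, 0]$, $\epsilon = 0.1$ and the default uniform prior $\mu = 1/K$,
+the smoothed label is $0.9 * [0, 1, 0, 0] + 0.1 * [0.25, 0.25, 0.25, 0.25]
+= [0.025, 0.925, 0.025, 0.025]$.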
+ +)DOC"); + } +}; + +class LabelSmoothGradOp : public framework::OperatorWithKernel { + public: + LabelSmoothGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, + label_smooth_grad, ops::LabelSmoothGradOp); +REGISTER_OP_CPU_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CPU_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a40a4e9ec82199afae3ae77bb2296a2fa95b0a5 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/label_smooth_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + label_smooth, + ops::LabelSmoothKernel, + ops::LabelSmoothKernel); +REGISTER_OP_CUDA_KERNEL( + label_smooth_grad, + ops::LabelSmoothGradKernel, + ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15752377f663fcb526b8306158e1e90d743c6cb6 --- /dev/null +++ b/paddle/fluid/operators/label_smooth_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LabelSmoothKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* in_t = ctx.Input("X"); + auto* dist_t = ctx.Input("PriorDist"); + auto label_dim = in_t->dims()[1]; + out_t->mutable_data(ctx.GetPlace()); + + auto epsilon = ctx.Attr("epsilon"); + auto out = framework::EigenVector::Flatten(*out_t); + auto in = framework::EigenVector::Flatten(*in_t); + auto& dev = *ctx.template device_context().eigen_device(); + if (dist_t) { + auto dist = framework::EigenVector::Flatten(*dist_t); + out.device(dev) = + static_cast(1 - epsilon) * in + + epsilon * dist.broadcast(Eigen::DSizes(in_t->numel())); + } else { + out.device(dev) = static_cast(1 - epsilon) * in + + static_cast(epsilon / label_dim); + } + } +}; + +template +class LabelSmoothGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); + d_in_t->mutable_data(ctx.GetPlace()); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto d_in = framework::EigenVector::Flatten(*d_in_t); + + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..60e37ed01b3cad428dc0184634b4d36c9f24f9c5 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +class LayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mean"), + "Output(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Variance"), + "Output(Variance) of LayerNormOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); + PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(), + "'begin_norm_axis' must be less than the rank of X."); + + auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + if (ctx->HasInput("Scale")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); + } + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); + } + + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + ctx->SetOutputDim("Mean", {left}); + ctx->SetOutputDim("Variance", {left}); + ctx->ShareLoD("X", "Y"); + } +}; + +class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor."); + AddInput("Scale", + "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddInput("Bias", + "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." + "It is applied to the output.") + .AsDispensable(); + AddOutput("Y", "(LoDTensor) Result after normalization."); + AddOutput("Mean", "(Tensor) Mean of the current mini batch.") + .AsIntermediate(); + AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + .AsIntermediate(); + + AddAttr("epsilon", + "(float, default 1e-5) Constant for " + "numerical stability") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("begin_norm_axis", + "(int default:1), the " + "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "normalized. `begin_norm_axis` splits the tensor(`X`) to a " + "matrix [N,H].") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); + + AddComment(R"DOC( +Layer Normalization. +Layer Norm has been implemented as discussed in the paper: +https://arxiv.org/abs/1607.06450 +... 
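+
+As a sketch of the computation (the exact implementation is the kernel in
+layer_norm_op.h): the input is flattened to a matrix [N, H] at
+`begin_norm_axis`, and each row is normalized independently:
+
+$$y = scale * \frac{x - mean}{\sqrt{variance + \epsilon}} + bias$$
+
+where mean and variance are computed over the H elements of that row, and the
+optional Scale and Bias inputs are 1-D tensors of size H.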
+)DOC"); + } +}; + +class LayerNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of LayerNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) of LayerNormOp should not be null."); + + // check output + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + return framework::OpKernelType(framework::ToDataType(t->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, + layer_norm_grad, ops::LayerNormGradOp); +REGISTER_OP_CPU_KERNEL( + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CPU_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa54fd54155ce19298ad9f80c930ad08e542d71c --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..60c0b07add172520dc8062d3d5e8e4e69758e1f1 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op.h @@ -0,0 +1,238 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +class LayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } +}; + +template +class LayerNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = 
ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias && scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (d_scale) { + // dy_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e1dfa494872b6f187ee0b3cca399308a1cab42a --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -0,0 +1,269 
@@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace paddle { +namespace operators { + +class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Emission", + "(LoDTensor, default LoDTensor) " + "A 2-D LoDTensor with shape [N x D], where N is the size of the " + "mini-batch and D is the total tag number. The unscaled emission " + "weight matrix for the linear chain CRF. "); + AddInput("Transition", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " + "operator. See more details in the operator's comments."); + AddInput("Label", + "(LoDTensor, default LoDTensor) A LoDTensor with shape " + "[N x 1], where N is the total element number in a mini-batch. " + "The ground truth."); + AddOutput( + "Alpha", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The forward vectors for the entire batch. Denote it as $\alpha$. " + "$\alpha$ is a memo table used to calculate the normalization " + "factor in CRF. $\alpha[k, v]$ stores the unnormalized " + "probabilites of all possible unfinished sequences of tags that end at " + "position $k$ with tag $v$. For each $k$, " + "$\alpha[k, v]$ is a vector of length $D$ with a component for " + "each tag value $v$. This vector is called a forward vecotr and " + "will also be used in backward computations.") + .AsIntermediate(); + AddOutput( + "EmissionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "The exponentials of Input(Emission). This is an intermediate " + "computational result in forward computation, and will be reused in " + "backward computation.") + .AsIntermediate(); + AddOutput( + "TransitionExps", + "(Tensor, default Tensor) A 2-D Tensor with shape " + "[(D + 2) x D]. The exponentials of Input(Transition). This is an " + "intermediate computational result in forward computation, and " + "will be reused in backward computation.") + .AsIntermediate(); + AddOutput( + "LogLikelihood", + "(Tensor, default Tensor) The logarithm of the conditional " + "likelihood of each training sample in a mini-batch. This is a 2-D " + "tensor with shape [S x 1], where S is the sequence number in a " + "mini-batch. Note: S is equal to the sequence number in a mini-batch. " + "The output is no longer a LoDTensor."); + AddComment(R"DOC( +LinearChainCRF Operator. + +Conditional Random Field defines an undirected probabilistic graph with nodes +denoting random variables and edges denoting dependencies between these +variables. CRF learns the conditional probability $P(Y|X)$, where +$X = (x_1, x_2, ... , x_n)$ are structured inputs and +$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs. + +Linear chain CRF is a special case of CRF that is useful for sequence labeling +task. 
Sequence labeling tasks do not assume a lot of conditional
+independences among inputs. The only constraint they impose is that the input
+and output must be linear sequences. Thus, the graph of such a CRF is a simple
+chain or a line, which results in the linear chain CRF.
+
+This operator implements the Forward-Backward algorithm for the linear chain
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
+
+Equation:
+1. Denote Input(Emission) to this operator as $x$ here.
+2. The first D values of Input(Transition) to this operator are for starting
+weights, denoted as $a$ here.
+3. The next D values of Input(Transition) of this operator are for ending
+weights, denoted as $b$ here.
+4. The remaining values of Input(Transition) are for transition weights,
+denoted as $w$ here.
+5. Denote Input(Label) as $s$ here.
+
+The probability of a sequence $s$ of length $L$ is defined as:
+$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+       + \sum_{l=1}^L x_{s_l}
+       + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+
+where $Z$ is a normalization value so that the sum of $P(s)$ over
+all possible sequences is 1, and $x$ is the emission feature weight
+to the linear chain CRF.
+
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
+likelihood of each training sample in a mini-batch.
+
+NOTE:
+1. The feature function for a CRF is made up of the emission features and the
+transition features. The emission feature weights are NOT computed in
+this operator. They MUST be computed first before this operator is called.
+
+2. Because this operator performs global normalization over all possible
+sequences internally, it expects UNSCALED emission feature weights.
+Please do not call this op with the emission feature being the output of any
+nonlinear activation.
+
+3. The 2nd dimension of Input(Emission) MUST be equal to the tag number.
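+
+As a small illustration (the notation mirrors the equation above; the numbers
+are hypothetical): with D = 2 tags and a single sequence of length L = 2
+labeled $s = (s_1, s_2) = (0, 1)$, the exponent reduces to the start weight
+$a_0$, the end weight $b_1$, the emission scores of tag 0 at position 1 and
+tag 1 at position 2, and the single transition weight $w_{0,1}$; $Z$ then
+sums the exponential of this score over all $D^L = 4$ possible tag sequences.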
+ +)DOC"); + } +}; + +class LinearChainCRFOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Alpha"), + "Output(Alpha) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("EmissionExps"), + "Output(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("TransitionExps"), + "Output(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"), + "Output(LogLikelihood) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + + ctx->SetOutputDim("Alpha", emission_dims); + ctx->SetOutputDim("EmissionExps", emission_dims); + ctx->SetOutputDim("TransitionExps", transition_dims); + // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood) + // is the sequence number in a mini-batch. The dimension set here should be + // resized to its correct size in the function Compute. Fix this once we can + // get LoD information in the InferShape interface. + ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); + } + + protected: + // Explicitly set that the data type of computation kernel of linear_chain_crf + // is determined by its input "Emission". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Emission")->type()), + platform::CPUPlace()); + } +}; + +class LinearChainCRFGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("EmissionExps"), + "Input(EmissionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("TransitionExps"), + "Input(TransitionExps) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")), + "Input(LogLikelihood@GRAD) shoudl be not null."); + + auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL, + "The Input(EmissionExps) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_exps_dims[0], + "An empty mini-batch is not allowed."); + + auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); + PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL, + "The Input(TransitionExps) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_exps_dims[0] - 2, transition_exps_dims[1], + "An invalid dimension for the Input(TransitionExps), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[1], transition_exps_dims[1], + "The 2nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_exps_dims[0], label_dims[0], + "The height of Input(EmissionExps) and the height of Input(Label) " + "should be the same."); + + if (ctx->HasOutput(framework::GradVarName("Emission"))) { + ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + } + if (ctx->HasOutput(framework::GradVarName("Transition"))) { + ctx->SetOutputDim(framework::GradVarName("Transition"), + transition_exps_dims); + } + } + + protected: + // Explicitly set that the data type of output of the linear_chain_crf_grad + // operator is determined by its input: gradients of LogLikelihood. + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("LogLikelihood")) + ->type()), + platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, + linear_chain_crf_grad, ops::LinearChainCRFGradOp); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cu b/paddle/fluid/operators/linear_chain_crf_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e04e76eebc71def814fed65a469ef5f9f1b16b0 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linear_chain_crf_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15b64c09bf366b356683c47c82a6dbf9529d9b58 --- /dev/null +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -0,0 +1,353 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +static inline T NormalizeL1(T* x, size_t len) { + T sum = 0.; + for (size_t i = 0; i < len; ++i) sum += x[i]; + // (This comment is from the old LinearChainCRFLayer.) + // Right now, we just bet that sum won't be zero. If this really happens, we + // will figure out what should be done then. + PADDLE_ENFORCE(sum, + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0."); + T s = 1. / sum; + for (size_t i = 0; i < len; ++i) x[i] *= s; + return sum; +} + +template +struct ScalarMul { + explicit ScalarMul(const T& scalar) : scalar(scalar) {} + T operator()(const T& val) const { return val * scalar; } + + T scalar; +}; + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class LinearChainCRFOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(caoying) The checks related to LoD information should be + // moved into InferShape once after the InferShape is refactored. 
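+    // (These remain runtime checks because the LoD of the inputs is only
+    // available when the kernel runs, not in the InferShape interface.)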
+ PADDLE_ENFORCE_EQ(ctx.Input("Emission")->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + PADDLE_ENFORCE_EQ(ctx.Input("Label")->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + auto in_lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence."); + const size_t level = 0; + const size_t seq_num = in_lod[level].size() - 1; + + const LoDTensor* emission_weights = ctx.Input("Emission"); + const Tensor* transition_weights = ctx.Input("Transition"); + const LoDTensor* label = ctx.Input("Label"); + + Tensor* emission_exps = ctx.Output("EmissionExps"); + Tensor* transition_exps = ctx.Output("TransitionExps"); + Tensor* alpha = ctx.Output("Alpha"); + Tensor* ll = ctx.Output("LogLikelihood"); + + // Because the computation codes only runs on CPU, here the memory for all + // the outputs is FIXED to be allocated on the CPU memory. + emission_exps->mutable_data(platform::CPUPlace()); + transition_exps->mutable_data(platform::CPUPlace()); + alpha->mutable_data(platform::CPUPlace()); + + // Resize the output tensor to its correct dimension. + ll->Resize({static_cast(seq_num), 1}); + ll->mutable_data(platform::CPUPlace()); + + // Now, all the inputs and outputs should be on the CPU memory. + auto emission_dims = emission_weights->dims(); + const size_t batch_size = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + Tensor emission_row_max; + emission_row_max.mutable_data( + framework::make_ddim({static_cast(batch_size), 1}), + platform::CPUPlace()); + + auto& place = *ctx.template device_context() + .eigen_device(); + auto x = EigenMatrix::From(*emission_weights); + auto x_row_max = EigenMatrix::From(emission_row_max); + x_row_max.device(place) = + x.maximum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(int(batch_size), 1)); + + auto x_exps = EigenMatrix::From(*emission_exps); + x_exps.device(place) = + (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); + + auto w = EigenMatrix::From(*transition_weights); + auto w_exps = EigenMatrix::From(*transition_exps); + w_exps.device(place) = w.exp(); + + T* log_likelihood = ll->data(); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(in_lod[level][i]); + int end_pos = static_cast(in_lod[level][i + 1]); + if (end_pos == start_pos) { + // If an empty input sequence is given, pad 0 for its cost. 
+ log_likelihood[i] = 0.; + continue; + } + + const Tensor one_seq = emission_weights->Slice(start_pos, end_pos); + Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); + Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + + log_likelihood[i] = ForwardOneSequence( + one_seq, one_seq_row_max, one_seq_exps, *transition_weights, + *transition_exps, one_seq_label, &one_seq_alpha); + } + }; + + private: + T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max, + const Tensor& emission_exps, const Tensor& trans_weights, + const Tensor& trans_weight_exps, const Tensor& label, + Tensor* alpha) const { + const T* x = emission.data(); + const T* x_row_max = emission_row_max.data(); + const T* x_exps = emission_exps.data(); + const T* w = trans_weights.data(); + const T* w_exps = trans_weight_exps.data(); + T* alpha_value = alpha->data(); + + auto x_dims = emission.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + // The 1st row of w are transition weights for start mask. + // The 2nd row of w are transition weights for end mask. + // Transition weights between other tags begin from the 3rd row of w. + const size_t state_trans_base_idx = 2; + + for (size_t i = 0; i < tag_num; ++i) { + alpha_value[i] = w_exps[i] * x_exps[i]; + } + T ll = -x_row_max[0] - std::log(NormalizeL1(alpha_value, tag_num)); + + for (size_t k = 1; k < seq_length; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += alpha_value[(k - 1) * tag_num + j] * // (*) + w_exps[(j + state_trans_base_idx) * tag_num + i]; + } + alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum; + } + // NormalizeL1 is to avoid underflow or overflow at (*). + ll -= x_row_max[k] + + std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); + } + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i]; + } + ll -= std::log(sum); + // Now ll is equal to -log(Z). + + const int64_t* lbl = label.data(); + PADDLE_ENFORCE_LT( + static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, + "An invalid tag label that execesses the largest tag number."); + + // Calculate the nominator part, which depends on the label sequence. + ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + + w[tag_num + lbl[seq_length - 1]] /*end transition*/; + for (size_t k = 1; k < seq_length; ++k) { + ll += x[k * tag_num + lbl[k]] + + w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]]; + } + return -ll; + } +}; + +template +class LinearChainCRFGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const size_t level = 0; // currently, only support sequence. + auto lod = ctx.Input("Label")->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence."); + + const Tensor* label = ctx.Input("Label"); + const Tensor* emission_exps = ctx.Input("EmissionExps"); + const Tensor* transition_exps = ctx.Input("TransitionExps"); + const Tensor* alpha = ctx.Input("Alpha"); + const T* ll_grad = + ctx.Input(framework::GradVarName("LogLikelihood"))->data(); + + Tensor* emission_grad = + ctx.Output(framework::GradVarName("Emission")); + Tensor* transition_grad = + ctx.Output(framework::GradVarName("Transition")); + + // TODO(caoying) Fix this constraint. 
When the Input(Emission) is from the + // data reader operator, it can have no gradients. + PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null."); + emission_grad->mutable_data(platform::CPUPlace()); + if (transition_grad) { + transition_grad->mutable_data(platform::CPUPlace()); + math::set_constant(ctx.device_context(), transition_grad, 0.); + } + // Now, all the inputs and outputs should be on the CPU memory. + + auto emission_dims = emission_exps->dims(); + // Beta is the memo table used in dynamic programming to calculate the + // backwark vectors. For a backward vector i (the i-th row of beta), it + // captures the unnormalized probabilities of partial sequences starting + // at position i. + Tensor beta; + beta.mutable_data(emission_dims, platform::CPUPlace()); + + for (size_t i = 0; i < lod[level].size() - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + if (end_pos == start_pos) continue; + + const Tensor one_seq_emission_exps = + emission_exps->Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + Tensor one_seq_beta = beta.Slice(start_pos, end_pos); + Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); + + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); + } + }; + + private: + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, + const Tensor& transition_exps, const Tensor& alpha, + const Tensor& label, Tensor* beta, + Tensor* transition_grad, + Tensor* emission_grad) const { + const T* w_exps = transition_exps.data(); + const T* x_exps = emission_exps.data(); + const int64_t* label_value = label.data(); + T* beta_value = beta->data(); + + auto x_dims = emission_exps.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + const size_t state_trans_base_idx = 2; + + // Calculate the backward vectors: beta. + // First, calculate the initialition state. + for (size_t i = 0; i < tag_num; ++i) { + beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i]; + } + NormalizeL1(beta_value + (seq_length - 1) * tag_num, tag_num); + for (int k = static_cast(seq_length) - 2; k >= 0; --k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + x_exps[(k + 1) * tag_num + j] * + beta_value[(k + 1) * tag_num + j]; + } + beta_value[k * tag_num + i] = sum; + } + // NormalizeL1 is to avoid underflow or overflow at (**). 
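+      // The per-row rescaling does not change the result: only ratios of
+      // beta values are used below, so the scale factors cancel out.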
+ NormalizeL1(beta_value + k * tag_num, tag_num); + } + + auto x_grad_mat = EigenMatrix::From(*emission_grad); + auto alpha_mat = EigenMatrix::From(alpha); + auto beta_mat = EigenMatrix::From(*beta); + + auto* place = ctx.eigen_device(); + auto prob = alpha_mat * beta_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + x_grad_mat.device(*place) = + (prob / row_sum).unaryExpr(ScalarMul(ll_grad)); + + for (size_t k = 0; k < seq_length; ++k) { + x_grad_mat(k, label_value[k]) -= static_cast(ll_grad); + } + + if (transition_grad) { + T* trans_grad = transition_grad->data(); + for (size_t k = 0; k < tag_num; ++k) { + // Do not multiply by the output gradient here, because x_grad_mat has + // alrealy done this. + trans_grad[k] += x_grad_mat(/*from start state*/ 0, k); + trans_grad[tag_num + k] += + x_grad_mat(/*to end state*/ seq_length - 1, k); + } + + auto x_exps_mat = EigenMatrix::From(emission_exps); + + // TODO(caoying): Fix this to avoid using this local variable if we can + // profile the training process. + Tensor tmp; + tmp.mutable_data(beta->dims(), platform::CPUPlace()); + auto tmp_mat = EigenMatrix::From(tmp); + auto prob = beta_mat * x_exps_mat; + auto row_sum = prob.sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(seq_length, 1)) + .broadcast(Eigen::DSizes(1, tag_num)); + tmp_mat.device(*place) = prob / row_sum; + + for (size_t k = 1; k < seq_length; ++k) { + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**) + alpha_mat(k - 1, i) * tmp_mat(k, j); + } + } + sum = 1. / sum; + for (size_t i = 0; i < tag_num; ++i) { + for (size_t j = 0; j < tag_num; ++j) { + trans_grad[(i + state_trans_base_idx) * tag_num + j] += + sum * w_exps[(i + state_trans_base_idx) * tag_num + j] * + alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad; + } + } + trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num + + label_value[k]] -= static_cast(ll_grad); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e5a17f21676331fb3c78d16d91c39ce48cc11c3 --- /dev/null +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace operators { + +constexpr char kOptimizeBlock[] = "OptimizeBlock"; + +void RunServer(std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else { + PADDLE_THROW( + "VariableMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } +} + +class ListenAndServOp : public framework::OperatorBase { + public: + ListenAndServOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + std::string endpoint = Attr("endpoint"); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); + } + } + + void Stop() override { + detail::MessageWithName term_msg; + term_msg.first = LISTEN_TERMINATE_MESSAGE; + rpc_service_->Push(term_msg); + rpc_service_->ShutDown(); + server_thread_->join(); + } + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Scope &recv_scope = scope.NewScope(); + + // FIXME(Yancey1989): initialize rpc server with lazy mode. + rpc_service_->SetScope(&recv_scope); + rpc_service_->SetDevCtx(&dev_ctx); + auto ins = Inputs("X"); + auto fan_in = ins.size(); + + auto *block = Attr(kOptimizeBlock); + auto *program = block->Program(); + framework::Executor executor(dev_place); + + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + bool exit_flag = false; + while (!exit_flag) { + // Get from multiple trainers, we don't care about the order in which + // the gradients arrives, just add suffix 0~n and merge the gradient. 
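+      // In outline: each trainer either sends a gradient variable or a
+      // BATCH_BARRIER_MESSAGE; the server keeps receiving until `fan_in`
+      // barriers have arrived, runs the optimize block once, then signals
+      // the clients via SetCond(1)/WaitClientGet.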
+ rpc_service_->SetCond(0); + size_t recv_var_cnt = 0; + int batch_barrier = 0; + while (batch_barrier != fan_in) { + const detail::MessageWithName &v = rpc_service_->Get(); + auto recv_var_name = v.first; + if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { + LOG(INFO) << "received terminate message and exit"; + exit_flag = true; + break; + } else if (recv_var_name == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "recv batch barrier message"; + batch_barrier++; + continue; + } else { + VLOG(3) << "received grad: " << recv_var_name; + recv_var_cnt++; + auto *var = recv_scope.FindVar(recv_var_name); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " << recv_var_name; + PADDLE_THROW("Can not find server side var"); + } + detail::DeserializeFromMessage(v.second, dev_ctx, var); + } + } + VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; + // TODO(Yancey1989): merge SelectedRows variables here + if (exit_flag) { + rpc_service_->ShutDown(); + } + + try { + executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->SetCond(1); + rpc_service_->WaitClientGet(recv_var_cnt); + grads_counter_.clear(); + } // while(true) + } + + protected: + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; +}; + +class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable(); + AddComment(R"DOC( +ListenAndServ operator + +This operator will start a RPC server which can receive variables +from send_op and send back variables to recv_op. +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr(kOptimizeBlock, + "BlockID to run on server side."); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp, + ops::ListenAndServOpMaker); diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1948063d886b79964b1a52d9d82a8e7d2fb0d493 --- /dev/null +++ b/paddle/fluid/operators/load_combine_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadCombineOp : public framework::OperatorBase { + public: + LoadCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + + auto out_var_names = Outputs("Out"); + PADDLE_ENFORCE_GT( + static_cast(out_var_names.size()), 0, + "The number of output variables should be greater than 0."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < out_var_names.size(); i++) { + auto *out_var = scope.FindVar(out_var_names[i]); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_names[i]); + + auto *tensor = out_var->GetMutable(); + + // Error checking + PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", + filename); + + // Get data from fin to tensor + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } + } +}; + +class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput( + "Out", + "(vector) The output LoDTensors that will be read from the input file.") + .AsDuplicable(); + AddAttr("file_path", + "(string) " + "LoDTensors will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +LoadCombine Operator. + +LoadCombine operator loads LoDTensor variables from a file. The file should +contain one or more LoDTensors serialized using the SaveCombine operator. The +LoadCombine operator applies a deserialization strategy to appropriately load +the LodTensors, and this strategy complements the serialization strategy used +in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled +with the SaveCombine operator, and can only deserialize one or more LoDTensors +that were saved using the SaveCombine operator. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load_combine, ops::LoadCombineOp, + ops::LoadCombineOpProtoMaker); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9bf5d72b234f96d9eb5a4c275737ac8c18cd63d --- /dev/null +++ b/paddle/fluid/operators/load_op.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class LoadOp : public framework::OperatorBase { + public: + LoadOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + filename); + + auto out_var_name = Output("Out"); + auto *out_var = scope.FindVar(out_var_name); + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", + out_var_name); + + auto *tensor = out_var->GetMutable(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + DeserializeFromStream(fin, tensor, dev_ctx); + + if (platform::is_gpu_place(place)) { + // copy CPU to GPU + framework::LoDTensor cpu_tensor; + cpu_tensor.ShareDataWith(*tensor); + cpu_tensor.set_lod(tensor->lod()); + + // reset tensor + out_var->Clear(); + tensor = out_var->GetMutable(); + tensor->set_lod(cpu_tensor.lod()); + Copy(cpu_tensor, place, dev_ctx, tensor); + } + } +}; + +class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddAttr("file_path", + "(string) " + "Variable will be loaded from \"file_path\".") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +Load Operator. + +Load operator will load a tensor variable from disk file. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f11f5a89f5ad5b2f3deed905625aefa1e9d9935b --- /dev/null +++ b/paddle/fluid/operators/lod_array_length_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class LoDArrayLengthOp : public framework::OperatorBase { + public: + LoDArrayLengthOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &out = + *scope.FindVar(Output("Out"))->GetMutable(); + out.Resize({1}); + auto cpu = platform::CPUPlace(); + *out.mutable_data(cpu) = static_cast(x.size()); + } +}; + +class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensorArray) The input tensor array."); + AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); + AddComment(R"DOC( +LoDArrayLength Operator. + +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ + +NOTE: The output is a CPU Tensor since the control variable should be only in +CPU and the length of LoDTensorArray should be used as control variables. + +)DOC"); + } +}; + +class LoDArrayLengthInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput("Out")); + context->SetOutputDim("Out", {1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp, + ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b9426a9f8f0b0b3082667dc7a1414aceb824aca --- /dev/null +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +class LoDRankTableOp : public framework::OperatorBase { + public: + LoDRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto x = scope.FindVar(Input("X"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + VLOG(10) << "Level = " << static_cast(Attr("level")); + out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; + } +}; + +class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) input lod tensor, must contain lod information."); + AddOutput("Out", "(LoDRankTable) The rank table of specific level."); + AddAttr("level", "(int) the specific lod level to rank.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment(R"DOC(Create LoDRanTable by LoDTensor + +LoD Rank Table stores the `level` of `lod` which is ordered by sequence +length in descending order. It is useful when implement dynamic RNN and is +shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +output operators. +)DOC"); + } +}; + +class LoDRankTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "LoDRankTable must has input X"); + } +}; + +class LoDRankTableInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o).SetType( + framework::proto::VarDesc::LOD_RANK_TABLE); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(lod_rank_table, paddle::operators::LoDRankTableOp, + paddle::operators::LoDRankTableOpProtoMaker, + paddle::operators::LoDRankTableInferShape, + paddle::operators::LoDRankTableInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..55ae71c1815470925b2bb153fc647b331dcc9ba4 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace paddle { +namespace operators { + +class LoDResetOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LoDResetOp should not be null."); + // If target LoD is not set form Input(), then it must be set from Attr(). + if (!ctx->HasInput("TargetLoD")) { + auto level0 = ctx->Attrs().Get>("target_lod"); + PADDLE_ENFORCE(level0.size() > 1, + "Target LoD is not found, should be set to be a valid one " + "through Input() or Attr()."); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The input tensor of lod_reset operator."); + AddInput("TargetLoD", + "(Tensor, optional) The target level 0 LoD from Input().") + .AsDispensable(); + AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator."); + AddAttr>("target_lod", + "The target level 0 LoD from Attr().") + .SetDefault(std::vector{}); + AddComment(R"DOC(LoDReset operator + +Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or +Attr(target_lod), or set LoD for Input(X) if it doesn't have one. +Currently the lod_reset operator only supports the reset of level 0 LoD. +At least one of Input(TargetLoD) and Attr(target_lod) must be set, +and if both of them are set, Input(TargetLoD) will be chosen as the +target LoD. + +An example: +Given a float LoDTensor X with shape (6, 1), its transpose form represents + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + +with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like + + [1.0, 2.0], [3.0, 4.0, 5.0], [6.0]. + +If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and +the sequences that the LoDTensor Output(Out) contains becomes: + + [1.0, 2.0, 3.0, 4.0], [5.0, 6.0]. 
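+
+That is, Output(Out) holds the same data as Input(X) but with
+LoD = [[0, 4, 6]], i.e. two sequences of lengths 4 and 2.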
+ +)DOC"); + } +}; + +class LoDResetGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad, + ops::LoDResetGradOp); +REGISTER_OP_CPU_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CPU_KERNEL( + lod_reset_grad, ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8bfc8bd3bf06037d7fcd387dee0514a1e4c6a0f9 --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lod_reset_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a10efee0bdd8c58d23c05bb85f0f882d801848fe --- /dev/null +++ b/paddle/fluid/operators/lod_reset_op.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LoDResetKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto* lod_t = ctx.Input("TargetLoD"); + + std::vector level0; + if (lod_t) { + auto* lod = lod_t->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor lod_cpu; + framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); + lod = lod_cpu.data(); + } + level0 = std::vector(lod, lod + lod_t->numel()); + } else { + level0 = ctx.Attr>("target_lod"); + } + + PADDLE_ENFORCE(level0.size() > 1UL, + "The size of target LoD should be greater than 1."); + PADDLE_ENFORCE(level0[0] == 0, + "Target LoD should be a vector starting from 0."); + PADDLE_ENFORCE(level0.back() == in->dims()[0], + "Target LoD should be a vector end with the " + "first dimension of Input(X)."); + for (size_t i = 0; i < level0.size() - 1; ++i) { + PADDLE_ENFORCE(level0[i + 1] > level0[i], + "Target LoD should be an ascending vector."); + } + + out->ShareDataWith(*in); + // cast level0 to size_t + std::vector ulevel0(level0.size(), 0); + std::transform(level0.begin(), level0.end(), ulevel0.begin(), + [](int a) { return static_cast(a); }); + framework::LoD target_lod; + target_lod.push_back(ulevel0); + out->set_lod(target_lod); + } +}; + +template +class LoDResetGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + + d_x->ShareDataWith(*d_out); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edc32bcec1441e50e24612789727db9a044cde54 --- /dev/null +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +class LoDTensorToArrayOp : public framework::OperatorBase { + public: + LoDTensorToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", + Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable"))) + .Get(); + auto &out = *detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + auto &items = rank_table.items(); + auto max_seq_len = items[0].length; + auto rank_level = rank_table.level(); + + PADDLE_ENFORCE_LT(rank_level, x.lod().size(), + "Input should be a LOD tensor, and size is at least %d", + rank_level + 1); + out.resize(max_seq_len); + std::vector> copy_ranges(max_seq_len); + + // set out[i] lod + for (size_t t = 0; t < max_seq_len; t++) { + auto &lod = *out[t].mutable_lod(); + lod.clear(); + for (auto &item : items) { + if (t >= item.length) { + break; + } + size_t start_idx = x.lod()[rank_level][item.index] + t; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x.lod(), start_idx, start_idx + 1, rank_level + 1); + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(&lod, lod_length); + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + for (size_t i = 0; i < max_seq_len; ++i) { + auto &ranges = copy_ranges[i]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out[i].Resize(x_dim); + out[i].mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[i][offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("RankTable", ""); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class LoDTensorToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of LoDTensorToArrayOp should not be null."); + PADDLE_ENFORCE( + context->HasInput("RankTable"), + "Input(RankTable) of LoDTensorToArrayOp should not be null."); + + 
PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of LoDTensorToArrayOp should not be null."); + + auto x_dim = context->GetInputDim("X"); + // The first dim of each LoDTensor in Output can only be set at run-time.; + // We still have to Resize each LoDTensor in Output. + context->SetOutputDim("Out", x_dim); + } +}; + +class LoDTensorToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + } + } +}; + +class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("array_to_lod_tensor"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp, + ops::LoDTensorToArrayOpProtoMaker, + ops::LoDTensorToArrayInferShape, + ops::LoDTensorToArrayInferVarType, + ops::LoDTensorToArrayGradMaker); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c5cd2956811329a1ac5da9e42e808c2684dc771 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/log_loss_op.h" + +namespace paddle { +namespace operators { + +class LogLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Predicted) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Predicted", "Loss"); + } +}; + +template +class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Predicted", + "The input value (Predicted) of Log loss op." + "Predicted is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Log loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the log loss."); + AddAttr("epsilon", "Epsilon in log loss."); + AddComment(R"DOC( +LogLoss Operator. + +Log loss is a loss function used for binary classification. Log Loss quantifies +the accuracy of a classifier by penalising false classifications. Minimising the +Log Loss is equivalent to maximising the accuracy of the classifier. We define +Predicted as the values predicted by our model and Labels as the target ground +truth value. Log loss can evaluate how close the predicted values are to the +target. The shapes of Predicted and Labels are both [batch_size, 1]. 
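+The epsilon attribute is a small positive constant added inside both logarithms
+so that the loss stays finite when a prediction saturates at exactly 0 or 1.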
+The equation is: + +$$ +Loss = - Labels * log(Predicted + \epsilon) - + (1 - Labels) * log(1 - Predicted + \epsilon) +$$ + +)DOC"); + } +}; + +class LogLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")), + "Output(Predicted@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Predicted"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, + ops::LogLossGradOp); +REGISTER_OP_CPU_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c164a6d04056c2e9d9302b609c1ff4f2b2c4a3f3 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/log_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..67fac7cfe55d1d50063afac925863a8fb2eb63a8 --- /dev/null +++ b/paddle/fluid/operators/log_loss_op.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class LogLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* loss_out = ctx.Output("Loss"); + + loss_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto loss = EigenVector::Flatten(*loss_out); + auto& place = *ctx.template device_context().eigen_device(); + + loss.device(place) = (-(label * (prediction + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - prediction + epsilon).log())); + } +}; + +template +class LogLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + + auto dl = EigenVector::Flatten(*dloss); + auto& place = *ctx.template device_context().eigen_device(); + + if (dpred) { + dpred->mutable_data(ctx.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + dx.device(place) = dl * (-(label / (prediction + epsilon)) + + ((static_cast(1) - label) / + (static_cast(1) - prediction + epsilon))); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff49895df1979bde9dc9f9c7b92601a2a65241da --- /dev/null +++ b/paddle/fluid/operators/logical_op.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) Left hand operand of %s operator", + comment.type)); + AddInput("Y", + string::Sprintf("(LoDTensor) Right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors. 
+Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + OpComment comment; + AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors. +Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class BinaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), + "The number of elements in X and Y should be same"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +template +class UnaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class LogicalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // LogicalOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, + 
paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.cu b/paddle/fluid/operators/logical_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b17444061252b714cd9bbaadc7fcf877628ef89 --- /dev/null +++ b/paddle/fluid/operators/logical_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/logical_op.h" + +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/logical_op.h b/paddle/fluid/operators/logical_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f6d5866c2c8e4ce54e9556a1acf69414dba523d2 --- /dev/null +++ b/paddle/fluid/operators/logical_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LogicalAndFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } +}; + +template +struct LogicalOrFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } +}; + +template +struct LogicalNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a) const { return !a; } +}; + +template +struct LogicalXorFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + return (a || b) && !(a && b); + } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); + Functor binary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); + } +}; + +template +class UnaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + Functor unary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), + out->mutable_data(context.GetPlace()), unary_func); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); + +#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c555f1a3fa228215812b7d3291e882b0d42bb64 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + + ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("W", + "An input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(-1); + AddComment(R"DOC( +Lookup Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. 
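+
+For example (illustrative values only): if W has shape [N, D] and Ids is the
+column vector [[3], [0], [3]], then Out has shape [3, D] and its rows are rows
+3, 0 and 3 of W. With padding_idx = 3, the two output rows whose id equals 3
+would be filled with zeros instead. The dense CPU path in lookup_table_op.h
+boils down to:
+
+    for (int64_t i = 0; i < ids_t->numel(); ++i) {
+      // row ids[i] of W becomes row i of Out (D elements per row)
+      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+    }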
+ +)DOC"); + } +}; + +class LookupTableOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "lookup_table_grad"; } +}; + +class LookupTableOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("W")->type()), + ctx.device_context()); + } +}; + +class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarDesc::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, + ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); +REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, + ops::LookupTableOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, + ops::LookupTableKernel); +REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, + ops::LookupTableGradKernel); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..801adba5a440ebda9506717603e8f9665ea9e6ba --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lookup_table_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +template +__global__ void LookupTable(T* output, const T* table, const int64_t* ids, + const int64_t N, const int64_t K, const int64_t D, + const int64_t padding_idx) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int64_t id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + T* out = output + idy * D; + const T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + if (PaddingFlag) { + if (id == padding_idx) + out[i] = static_cast(0); + else + out[i] = tab[i]; + } else { + out[i] = tab[i]; + } + } + idy += BlockDimY * GridDimX; + } +} + +template +__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, + const int64_t N, const int64_t K, + const int64_t D) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int id = ids[idy]; + PADDLE_ASSERT(id >= 0); + PADDLE_ASSERT(id < N); + const T* out = output + idy * D; + T* tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + } + idy += BlockDimY * GridDimX; + } +} + +template +class LookupTableCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); + auto* ids_t = context.Input("Ids"); + auto* output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); + } +}; + +template +class LookupTableGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + auto stream = dev_ctx.stream(); + // copy GPU memory to CPU pinned memory + framework::Vector new_rows; + new_rows.resize(ids_dim[0]); + auto gpu_place = boost::get(context.GetPlace()); + + // TODO(yuyang18): Strange code here. 
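+      // The rows of the sparse gradient are exactly the looked-up ids; they
+      // are copied into new_rows so that d_table->set_rows() below can record
+      // them, while the gradient values themselves are copied
+      // device-to-device into d_table's value tensor further down.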
+ memory::Copy(platform::CPUPlace(), + new_rows.CUDAMutableData(context.GetPlace()), gpu_place, + ids_data, ids_dim[0] * sizeof(int64_t), stream); + + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + auto* d_table_data = d_table_value->data(); + auto* d_output_data = d_output->data(); + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, + d_output->numel() * sizeof(T), stream); + + } else { + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + const int64_t* ids = ids_t->data(); + const T* d_output = d_output_t->data(); + T* d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + dim3 threads(128, 8); + dim3 grids(8, 1); + LookupTableGrad<<>>( + d_table, d_output, ids, N, K, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d264496882a9e1828953d843d4a18fe4c16b1d24 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +template +class LookupTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* table_t = context.Input("W"); // float tensor + auto* ids_t = context.Input("Ids"); // int tensor + auto* output_t = context.Output("Out"); // float tensor + int64_t padding_idx = context.Attr("padding_idx"); + + int N = table_t->dims()[0]; + int D = table_t->dims()[1]; + auto* ids = ids_t->data(); + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + if (padding_idx == -1) { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } else { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + if (ids[i] == padding_idx) { + memset(output + i * D, 0, D * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } + } + } +}; + +template +class LookupTableGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto* ids = context.Input("Ids"); + auto* table = context.Input("W"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + framework::Vector new_rows; + new_rows.reserve(ids_dim[0]); + for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.push_back(ids_data[i]); + } + d_table->set_rows(new_rows); + + auto* d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + d_table->set_height(table->dims()[0]); + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table_value->data(); + + PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + } else { + auto* ids = context.Input("Ids"); + auto* d_output = context.Input(framework::GradVarName("Out")); + auto* d_table = context.Output(framework::GradVarName("W")); + auto* table = context.Input("W"); + + auto* ids_data = ids->data(); + auto ids_dim = ids->dims(); + + int N = table->dims()[0]; + int D = d_output->dims()[1]; + + auto* d_output_data = d_output->data(); + auto* d_table_data = d_table->mutable_data(context.GetPlace()); + + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + for (int64_t i = 0; i < ids->numel(); ++i) { + PADDLE_ENFORCE_LT(ids_data[i], N); + PADDLE_ENFORCE_GE(ids_data[i], 0); + for (int j = 0; j < D; ++j) { + d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..c84507f231c6d3c4c5d6e33719fafc2752876f03 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / 
c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + +class LRNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LRNOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MidOut"), + "MidOut(Out) of LRNOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + + ctx->SetOutputDim("Out", x_dim); + ctx->SetOutputDim("MidOut", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class LRNOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input of LRN operator. " + "It must be a 4D tenor with NCHW format."); + AddOutput("Out", + "(Tensor) The output of LRN operator, which is also the 4D " + "tensor with NCHW format."); + AddOutput("MidOut", + "(Tensor) Middle result of LRN operator. It's computed in " + "forward process and also used in backward process."); + + AddAttr("n", + "(int default 5) " + "n is the \"adjacent\" kernel that maps " + "at the same spatial position.") + .SetDefault(5) + .GreaterThan(0); + + AddAttr("k", + "(float, default 2.0) " + "k is the bias.") + .SetDefault(2.0) + .GreaterThan(0.0); + + AddAttr("alpha", + "(float, default 0.0001) " + "alpha is the scale number.") + .SetDefault(0.0001) + .GreaterThan(0.0); + + AddAttr("beta", + "(float, default 0.75) " + "beta is the power number.") + .SetDefault(0.75) + .GreaterThan(0.0); + + AddComment(R"DOC( +Local Response Normalization Operator. + +This operator comes from the paper: +<>. + +The original formula is: + +$$ +Output(i, x, y) = Input(i, x, y) / \left( +k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} +(Input(j, x, y))^2 +\right)^{\beta} +$$ + +Function implementation: + +Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4. +And dimensions 0 ~ 3 represent batch size, feature maps, rows, +and columns, respectively. + +Input and Output in the formula above is for each map(i) of one image, and +Input(i, x, y), Output(i, x, y) represents an element in an image. + +C is the number of feature maps of one image. n is a hyper-parameter +configured when operator is initialized. The sum in the denominator +is the sum of the same positions in the neighboring maps. 
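+
+For example, with the default n = 5 the window for channel c covers the two
+neighboring channels on each side of c, clipped at the channel boundaries.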
+ +)DOC"); + } +}; + +class LRNOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..03112bf3e03595a521c22cd914f414f026970c10 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lrn_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput<<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const 
framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; + +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. + while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b7b78b459145bae5483e3f12b3d872c679823740 --- /dev/null +++ b/paddle/fluid/operators/lrn_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + +template +class LRNKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + + // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) + // x represents inputs + // f(x) represents outputs + void Compute(const framework::ExecutionContext& ctx) const override { + // input + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); + + // NCHW + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + // MidOut save the intermediate result for backward + Tensor* mid = ctx.Output("MidOut"); + mid->mutable_data(ctx.GetPlace()); + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + T k = ctx.Attr("k"); + + PADDLE_ENFORCE(n > 0, "n should >= 0"); + PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); + PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); + PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); + + LRNFunctor f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); + } +}; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + +/** + * \brief Backward calculation for normalization with across maps. + * + * Function implementation: + * + * The implementation of this Function is derived from the + * CrossMapNormalFunc implementation. + * + * InputGrad = OutputGrad * MidOut ^ (-beta) + * -- upper + * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue + * -- lower + * + * The data of inputs/outputs format is the same as the forward interface + * and is NCHW. + * + * The upper and lower is the same as forward. The logic of the sum + * is also the same as forward. 
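+ *
+ * In the LRNGradFunctor specializations, the factor -2 * alpha * beta above
+ * shows up as `ratio`: the CPU functor in lrn_op.cc computes
+ * ratio = -2 * alpha * beta and adds the term, while the CUDA kernel in
+ * lrn_op.cu is launched with 2.0f * alpha * beta and subtracts it.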
+ */ +template +class LRNGradKernel : public framework::OpKernel { + public: + using Tensor = framework::Tensor; + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); + + auto x_g = ctx.Output(framework::GradVarName("X")); + x_g->mutable_data(ctx.GetPlace()); + + auto x_dims = x.dims(); + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int n = ctx.Attr("n"); + T alpha = ctx.Attr("alpha"); + T beta = ctx.Attr("beta"); + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1f1b5f235f991e7a4d84c815bc5dda74ab64752 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.cc @@ -0,0 +1,281 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace paddle { +namespace operators { + +class LSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchGate) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], frame_size, + "The first dimension of Input(Weight) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + 
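+ // The bias row stores the gate biases {b_c, b_i, b_f, b_o} in its first
+ // 4 * frame_size columns; with peepholes enabled it additionally stores
+ // the diagonal weights {W_ic, W_fc, W_oc}, giving 7 * frame_size columns
+ // in total. The checks below enforce exactly this layout.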
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->ShareLoD("Input", "Hidden"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the first input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights, which contains two parts: " + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Hidden", + "(LoDTensor) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the nonlinear computation. This " + "LoDTensor has the same shape as the reorganized input, which " + "is also be called batch input. The LoD size is 2. 
The first " + "LoD is the batch offsets and the second LoD contains the " + "indexes, which denote the position of reorganized sequence " + "in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) This LoDTensor is obtained in the forward and used " + "in the backward.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory (LSTM) Operator. + +The defalut implementation is diagonal/peephole connection +(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the non-line activations, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\tilde{c_t}$ is also called candidate hidden state, +which is computed based on the current input and the previous hidden state. + +Set `use_peepholes` False to disable peephole connection. The formula +is omitted here, please refer to the paper +http://www.bioinf.jku.at/publications/older/2604.pdf for details. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. +Users can choose to use fully-connect operator before LSTM operator. 
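+For example, for a mini-batch with T = 6 total time steps and hidden size
+D = 32, `Input` holds the precomputed gate projections with shape (6 x 128),
+while `Hidden` and `Cell` are produced with shape (6 x 32) and share the LoD
+of `Input`.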
+ +)DOC"); + } +}; + +class LSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Hidden"), + "Input(Hidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..679d02b1f9a1d1f0313b5c8109285c92734e3e5a --- /dev/null +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c48495533cf5dbf1f46176cff936f7f988a3d48 --- /dev/null +++ b/paddle/fluid/operators/lstm_op.h @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, *dst, indexed_src); +} + +template +class LSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* hidden_out = ctx.Output("Hidden"); + hidden_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmMetaValue will be updated later. + + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + lstm_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. 
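+ // batch_hidden and batch_cell below hold the per-step results in batch
+ // order. The loop that follows walks the batch offsets produced by
+ // LoDTensor2Batch: at each step it accumulates the recurrent term (the
+ // previous step's hidden state, or the reordered H0 at step 0, multiplied
+ // by Weight) into the gate pre-activations and then applies
+ // LstmUnitFunctor to produce that step's cell and hidden values.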
+ LoDTensor batch_hidden, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_hidden.mutable_data(dims, ctx.GetPlace()); + batch_cell.mutable_data(dims, ctx.GetPlace()); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor out_t = batch_hidden.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTM reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. + Tensor ordered_h0; + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + math::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_hidden, *hidden_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_out = ctx.Input("Hidden"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + + auto* hidden_g = ctx.Input(framework::GradVarName("Hidden")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template 
device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = hidden_g->dims(); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstm_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + math::LstmMetaGrad lstm_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; + } else { + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden, batch_hidden_g, batch_cell; + ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden); + ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g); + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. 
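+ // Until then, the gradient flowing in through the output cell state is
+ // treated as zero, so batch_cell_g is simply cleared below before the
+ // backward pass walks the batches in reverse time order.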
+ // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); + + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); + if (weight_g) { + /* backward weight */ + auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + if (h0 && h0_g) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstm_unit_op.cc 
b/paddle/fluid/operators/lstm_unit_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d33d47e0c3ba8b83be2c06e9884d90e9bb1012e --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstm_unit_op.h" + +namespace paddle { +namespace operators { + +class LstmUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("C_prev"), + "Input(C_prev) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("C"), + "Output(C) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("H"), + "Output(H) of LSTM should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto c_prev_dims = ctx->GetInputDim("C_prev"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], + "Batch size of inputs and states must be equal"); + PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, + "Dimension of FC should equal to prev state * 4"); + + int b_size = c_prev_dims[0]; // batch size + int s_dim = c_prev_dims[1]; // state dim + ctx->SetOutputDim("C", {b_size, s_dim}); + ctx->SetOutputDim("H", {b_size, s_dim}); + } +}; + +class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "Lstm unit only applies non-linear activations, please make sure" + "that linear tranformation has already been applied to `X`. 
" + "Linear tranformation can be applied by adding a `fc` layer"); + AddInput( + "C_prev", + "The cell state tensor of last time-step in the Lstm Unit operator."); + AddOutput("C", "The cell tensor of Lstm Unit operator."); + AddOutput("H", "The hidden state tensor of Lstm Unit operator."); + AddAttr("forget_bias", + "(float, default 0.0) " + "The forget bias of Lstm Unit.") + .SetDefault(0.0); + AddComment(R"DOC( +Lstm Unit Operator + +Equation: + +$$ +i, f, o, j = split(X) \\ +C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ +H = C * sigm(o) +$$ + +)DOC"); + } +}; + +class LstmUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")), + "Input(C@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")), + "Input(H@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("C_prev"), + ctx->GetInputDim("C_prev")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad, + ops::LstmUnitGradOp); +REGISTER_OP_CPU_KERNEL(lstm_unit, + ops::LstmUnitKernel, + ops::LstmUnitKernel); +REGISTER_OP_CPU_KERNEL( + lstm_unit_grad, ops::LstmUnitGradKernel, + ops::LstmUnitGradKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..12ebffca37f995111bbeb2a1e8b30ea2fe35c74d --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu +*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cross_entropy_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ Dtype cuda_sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype cuda_tanh(const Dtype x) { + return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. 
* x)); +} + +template +__global__ void LSTMUnitKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, T* C, T* H, + const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + + const T* X_offset = X + 4 * dim * n; + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = f * c_prev + i * g; + C[index] = c; + const T tanh_c = cuda_tanh(c); + H[index] = o * tanh_c; + } +} + +template +__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim, + const T* C_prev, const T* X, const T* C, + const T* H, const T* C_diff, + const T* H_diff, T* C_prev_diff, + T* X_diff, const T forget_bias) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const T* X_offset = X + 4 * dim * n; + T* c_prev_diff = C_prev_diff + index; + T* X_diff_offset = X_diff + 4 * dim * n; + T* i_diff = X_diff_offset + d; + T* f_diff = X_diff_offset + 1 * dim + d; + T* o_diff = X_diff_offset + 2 * dim + d; + T* g_diff = X_diff_offset + 3 * dim + d; + + const T i = cuda_sigmoid(X_offset[d]); + const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias); + const T o = cuda_sigmoid(X_offset[2 * dim + d]); + const T g = cuda_tanh(X_offset[3 * dim + d]); + const T c_prev = C_prev[index]; + const T c = C[index]; + const T tanh_c = cuda_tanh(c); + const T c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[index] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } +} + +template +class LstmUnitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + int block = 512; + int n = b_size * D; + int grid = (n + block - 1) / block; + + LSTMUnitKernel<<>>(n, D, C_prev, X, C, H, forget_bias); + } +}; + +template +class LstmUnitGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); 
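+ // H_diff above and C_diff below are the incoming gradients w.r.t. the
+ // hidden and cell outputs; the kernel launched further down covers the
+ // N x D elements with a 1-D grid-stride loop, using the same block/grid
+ // configuration as the forward kernel.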
+ auto* C_diff = cdiff_tensor->data(); + + auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int block = 512; + int n = N * D; + int grid = (n + block - 1) / block; + + LSTMUnitGradientKernel<<>>(n, D, C_prev, X, C, H, C_diff, + H_diff, C_prev_diff, X_diff, + forget_bias); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f2370fe690a45f49c0138fbd1303d7bfd6dacd0 --- /dev/null +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* Acknowledgement: the following code is strongly inspired by +https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h +*/ + +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +inline T sigmoid(T x) { + return 1. / (1. + exp(-x)); +} + +template +inline T tanh(T x) { + return 2. * sigmoid(2. 
* x) - 1.; +} + +template +class LstmUnitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + int b_size = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + T* C = c_tensor->mutable_data(ctx.GetPlace()); + T* H = h_tensor->mutable_data(ctx.GetPlace()); + + const T* X = x_tensor->data(); + const T* C_prev = c_prev_tensor->data(); + + for (int n = 0; n < b_size; ++n) { + for (int d = 0; d < D; ++d) { + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = f * c_prev + i * g; + C[d] = c; + const T tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + } + } +}; + +template +class LstmUnitGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); + + auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + + auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto c_prev_diff_tensor = + ctx.Output(framework::GradVarName("C_prev")); + + auto* X = x_tensor->data(); + auto* C_prev = c_prev_tensor->data(); + auto* C = c_tensor->data(); + auto* H = h_tensor->data(); + + auto* H_diff = hdiff_tensor->data(); + auto* C_diff = cdiff_tensor->data(); + + auto* C_prev_diff = c_prev_diff_tensor->mutable_data(ctx.GetPlace()); + auto* X_diff = xdiff_tensor->mutable_data(ctx.GetPlace()); + + int N = c_tensor->dims()[0]; + int D = c_tensor->dims()[1]; + + auto forget_bias = static_cast(ctx.Attr("forget_bias")); + + for (int n = 0; n < N; ++n) { + for (int d = 0; d < D; ++d) { + T* c_prev_diff = C_prev_diff + d; + T* i_diff = X_diff + d; + T* f_diff = X_diff + 1 * D + d; + T* o_diff = X_diff + 2 * D + d; + T* g_diff = X_diff + 3 * D + d; + + const T i = sigmoid(X[d]); + const T f = sigmoid(X[1 * D + d] + forget_bias); + const T o = sigmoid(X[2 * D + d]); + const T g = tanh(X[3 * D + d]); + const T c_prev = C_prev[d]; + const T c = C[d]; + const T tanh_c = tanh(c); + const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += D; + X += 4 * D; + C += D; + H += D; + C_diff += D; + H_diff += D; + X_diff += 4 * D; + C_prev_diff += D; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d30edf5c3cbabe7223a459a5f60b7b9aa51af9a --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace paddle { +namespace operators { + +class LSTMPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Projection"), + "Output(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchCellPreAct) of LSTMP operator should not be " + "null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(BatchHidden) of LSTMP operator should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, + "Input(X)'s rank of LSTMP operator must be 2."); + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + auto proj_dims = ctx->GetInputDim("ProjWeight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1], + "The first dimension of Input(Weight) " + "should be %d.", + proj_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + PADDLE_ENFORCE_EQ(proj_dims.size(), 2, + "The rank of Input(ProjWeight) should be 2."); + PADDLE_ENFORCE_EQ(proj_dims[0], frame_size, + "The first dimension of Input(ProjWeight) " + "should be %d.", + frame_size); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of LSTMP operator should not be null after " + "Input(H0) provided."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); + } + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable 
peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + framework::DDim proj_out_dims({in_dims[0], proj_dims[1]}); + ctx->SetOutputDim("Projection", proj_out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("BatchHidden", out_dims); + ctx->ShareLoD("Input", "Projection"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the input for sequence data, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `C0` should not be null if `H0` provided.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (P x 4D), where P is the projection layer size " + "and D is the hidden size." + " - Weight = {W_cr, W_ir, W_fr, W_or}"); + AddInput("ProjWeight", + "(Tensor) the learnable weight of the projection layer." + " - The shape is (D x P), where P is the recurrent projection " + "layer size and D is the hidden size." + " - ProjWeight = {W_rh}"); + AddInput("Bias", + "(Tensor) the learnable biases, which contains two parts: " + "input-hidden biases and peephole connections weights if " + "setting `use_peepholes` to `True`. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Projection", + "(LoDTensor) the projection of the hidden state of LSTMP " + "operator. The shape is (T x P), and LoD is the same with the " + "`Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTMP operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the activations. This LoDTensor has the " + "same shape as the reorganized input, which is also be called " + "batch input. The LoD size is 2. The first-level LoD is the " + "batch offsets and the second contains the indices, which " + "denotes the position of reorganized sequence in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) the pre-activation cell state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("BatchHidden", + "(LoDTensor) the hidden state reorganized in batch. 
" + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("OrderedP0", + "(Tensor) the projection of the initial hidden state " + "H0. This is a tensor with shape (N x P), where N is the " + "batch size and P is the hidden size.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTMP.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("proj_activation", + "(string, default: tanh)" + "The activation for projection output, " + "`tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. + +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and furthermore computational complexity for +the LSTM, espeacially for the case that the size of output units is relative +large (https://research.google.com/pubs/archive/43905.pdf). + +The formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) \\ + +r_t = \overline{act_h}(W_{rh}h_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the activation, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current +input and previous hidden state. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\overline{act_h}$ is the activation function for the +projection output, usually using `identity` or same as $act_h$. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. 
+Users can choose to use fully-connected operator before LSTMP operator. + +)DOC"); + } +}; + +class LSTMPGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Projection"), + "Input(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTMP operator should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("ProjWeight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad, + ops::LSTMPGradOp); +REGISTER_OP_CPU_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL( + lstmp_grad, ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bcefb94c75b8577fefb1ee3b440dc5fb045562d5 --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lstmp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL( + lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..22ef4721860a493fded98cf32b40a2aceb851a5c --- /dev/null +++ b/paddle/fluid/operators/lstmp_op.h @@ -0,0 +1,496 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index, *dst, indexed_src); +} + +template +class LSTMPKernel : public framework::OpKernel { + public: + template + void ActCompute(const math::detail::ActivationType act_type, const Device& d, + X x, Y y) const { + if (act_type == math::detail::ActivationType::kIdentity) + y.device(d) = x; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kTanh) + TanhFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kReLU) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Output("OrderedP0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* proj_out = ctx.Output("Projection"); + proj_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmpMetaValue will be updated later. 
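+ // With peepholes the bias row is {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc},
+ // so the three check_* pointers below address consecutive
+ // frame_size-sized slices starting right after the 4 * frame_size gate
+ // biases.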
+ + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + lstmp_value.prev_state_value = nullptr; + Tensor ordered_c0; + + framework::Vector order(batch_gate->lod()[2]); + + if (cell_t0) { + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstmp_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_proj, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + auto* batch_hidden = ctx.Output("BatchHidden"); + batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D + batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P + batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + Tensor proj_t = batch_proj.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. 
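+        // The reordered H0 is projected with ProjWeight (and optionally
+        // activated) to produce OrderedP0, which is then multiplied by Weight
+        // and accumulated into the gate values, mirroring the recurrent
+        // projection performed for n > 0.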
+ + Tensor ordered_h0; + ordered_proj0->mutable_data(ctx.GetPlace()); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, + *proj_weight, false, static_cast(1.0), + ordered_proj0, static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + ActCompute(cell_act, place, proj0_dev, proj0_dev); + } + math::matmul(device_ctx, *ordered_proj0, false, + *weight, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } + + lstmp_value.gate_value = gate_t.data(); + lstmp_value.output_value = hidden_t.data(); + lstmp_value.state_value = cell_t.data(); + lstmp_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstmp_value.prev_state_value = lstmp_value.state_value; + math::matmul(device_ctx, hidden_t, false, *proj_weight, + false, static_cast(1.0), &proj_t, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj_t_dev = EigenMatrix::From(proj_t); + ActCompute(cell_act, place, proj_t_dev, proj_t_dev); + } + } + + math::Batch2LoDTensorFunctor to_seq; + batch_proj.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_proj, *proj_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMPGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const math::detail::ActivationType act_type, + const Device& d, X x, Y y, DX dx, DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == math::detail::ActivationType::kIdentity) + dx.device(d) = dy; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kTanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kReLU) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* proj_out = ctx.Input("Projection"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + auto* batch_hidden = ctx.Input("BatchHidden"); + + auto* projection_g = + ctx.Input(framework::GradVarName("Projection")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* proj_weight_g = + ctx.Output(framework::GradVarName("ProjWeight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Input("OrderedP0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + if (proj_weight_g) { + 
proj_weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, proj_weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + + framework::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = cell_out->dims(); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + + math::LstmMetaGrad lstmp_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; + lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; + } else { + lstmp_grad.check_ig_grad = nullptr; + lstmp_grad.check_fg_grad = nullptr; + lstmp_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; + batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); + ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P + ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. 
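+    // Until then, the gradient w.r.t. the output cell state is treated as
+    // zero, so batch_cell_g is simply zero-initialized here.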
+ // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor cur_proj = batch_proj.Slice(bstart, bend); + Tensor proj_g = batch_proj_g.Slice(bstart, bend); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto cur_proj_dev = EigenMatrix::From(cur_proj); + auto proj_g_dev = EigenMatrix::From(proj_g); + ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, + proj_g_dev); + } + /* hidden state backwarad */ + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + math::matmul(device_ctx, proj_g, false, *proj_weight, + true, static_cast(1.0), &out_g, + static_cast(0.0)); + /* projection weight backward*/ + if (proj_weight_g) { + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + math::matmul(device_ctx, hidden_t, true, proj_g, + false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstmp_value.gate_value = gate.data(); + lstmp_value.state_value = cell.data(); + lstmp_value.state_active_value = cell_pre_act.data(); + + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstmp_grad.state_grad = cell_g.data(); + lstmp_grad.gate_grad = gate_g.data(); + lstmp_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstmp_value.prev_state_value = cell_pre.data(); + lstmp_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstmp_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_proj_g, + static_cast(1.0)); + if (weight_g) { + /* weight backward*/ + auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + if (weight_g) { + math::matmul(device_ctx, *ordered_proj0, true, + gate_g, false, static_cast(1.0), + weight_g, static_cast(1.0)); + } + } + if (h0 && (h0_g || proj_weight_g)) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + Tensor proj0_g; + proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); + proj0_g.mutable_data(ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), &proj0_g, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + auto proj0_g_dev = EigenMatrix::From(proj0_g); + ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, + proj0_g_dev); + } + if (h0_g) { + math::matmul( + device_ctx, proj0_g, false, *proj_weight, true, + static_cast(1.0), &ordered_h0_g, static_cast(0.0)); + } + if (proj_weight_g) { + math::matmul(device_ctx, ordered_h0, true, + proj0_g, false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc31befb20526e84aae1804756d2d44a785aa229 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace paddle { +namespace operators { + +class MarginRankLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto label_dims = ctx->GetInputDim("Label"); + auto x1_dims = ctx->GetInputDim("X1"); + auto x2_dims = ctx->GetInputDim("X2"); + PADDLE_ENFORCE( + (label_dims == x1_dims) && (x1_dims == x2_dims) && + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensor with shape [batch_size x 1]."); + ctx->SetOutputDim("Activated", label_dims); + ctx->SetOutputDim("Out", label_dims); + } +}; + +template +class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X1", + "(2-D tensor with shape [batch_size x 1]) The score for " + "one item X1 to be ranked, from pairwise ranking model."); + AddInput("X2", + "(2-D tensor with shape [batch_size x 1]) The score for " + "another item X2 to be ranked, from pairwise ranking model."); + AddInput("Label", + "(2-D tensor with shape [batch_size x 1]) " + "The label indicating X1 ranked higher than X2 or not, " + "can only be +1 or -1."); + AddOutput("Activated", + "(2-D tensor with shape [batch_size x 1]) Intermediate tensor " + "to indicate whether each element of Output(Out) is activated.") + .AsIntermediate(); + AddOutput("Out", + "(2-D tensor with shape [batch_size x 1]) " + "The output loss of MarginRankLoss operator."); + AddAttr("margin", "(scalar, default 0) Margin for MarginRankLossOp.") + .SetDefault(static_cast(0)); + AddComment(R"DOC( +MarginRankLoss Operator. + +This operator measures the loss given a pair of training sample +{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` +indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +is calculated as: + +$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ + +The attribute `margin` here helps make the predictions more robust. +Denote the item ranked higher as the positive sample, otherwise the negative +sample. If the score of the two samples satisfies + +$positive sample - negative sample < margin$ + +the pair of samples will contribute to the final loss, which will backpropagate +and train the ranking model to enlarge the difference between the two scores. + +For batch input with size `batch_size`, `X1`, `X2` and `Label` +all have the same shape [batch_size x 1]. 
+ +)DOC"); + } +}; + +class MarginRankLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Activated"), + "Intermediate(Activated) shouldn't be null."); + auto dims = ctx->GetInputDim("Label"); + ctx->SetOutputDim(framework::GradVarName("X1"), dims); + ctx->SetOutputDim(framework::GradVarName("X2"), dims); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, + ops::MarginRankLossOpMaker, margin_rank_loss_grad, + ops::MarginRankLossGradOp); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CPU_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca4593a48d6d3eccff81dfd621ea1198e5bad880 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_rank_loss_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( + margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..934a5da0f804f7cf7dc176a9ee4e1b72261ef008 --- /dev/null +++ b/paddle/fluid/operators/margin_rank_loss_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +struct ReLU { + HOSTDEVICE T operator()(const T& val) const { + return val > 0 ? 
val : static_cast(0); + } +}; + +template +struct Heaviside { + HOSTDEVICE T operator()(const T& val) const { + return static_cast(val > 0 ? 1 : 0); + } +}; + +template +class MarginRankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* act_t = ctx.Output("Activated"); + + auto* label_t = ctx.Input("Label"); + auto* x1_t = ctx.Input("X1"); + auto* x2_t = ctx.Input("X2"); + + out_t->mutable_data(ctx.GetPlace()); + act_t->mutable_data(ctx.GetPlace()); + + auto margin = static_cast(ctx.Attr("margin")); + auto out = framework::EigenVector::Flatten(*out_t); + auto act = framework::EigenVector::Flatten(*act_t); + + auto label = framework::EigenVector::Flatten(*label_t); + auto x1 = framework::EigenVector::Flatten(*x1_t); + auto x2 = framework::EigenVector::Flatten(*x2_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); + act.device(dev) = out.unaryExpr(Heaviside()); + } +}; + +template +class MarginRankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_x1_t = + ctx.Output(framework::GradVarName("X1")); + auto* d_x2_t = + ctx.Output(framework::GradVarName("X2")); + + auto* act_t = ctx.Input("Activated"); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto act = framework::EigenVector::Flatten(*act_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto& dev = *ctx.template device_context().eigen_device(); + + // compute d_x1 + if (d_x1_t) { + d_x1_t->mutable_data(ctx.GetPlace()); + auto d_x1 = framework::EigenVector::Flatten(*d_x1_t); + d_x1.device(dev) = -d_out * act * label; + } + // compute d_x2 + if (d_x2_t) { + d_x2_t->mutable_data(ctx.GetPlace()); + auto d_x2 = framework::EigenVector::Flatten(*d_x2_t); + d_x2.device(dev) = d_out * act * label; + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt similarity index 100% rename from paddle/operators/math/CMakeLists.txt rename to paddle/fluid/operators/math/CMakeLists.txt diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc new file mode 100644 index 0000000000000000000000000000000000000000..b73d976d1b3e6dcf99e5cc525263282b1253c600 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu new file mode 100644 index 0000000000000000000000000000000000000000..bbd36a6e8f54833f15ee0c991228c10b7f74f272 --- /dev/null +++ b/paddle/fluid/operators/math/context_project.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/context_project.h" + +namespace paddle { +namespace operators { +namespace math { + +template class ContextProjectFunctor; +template class ContextProjectFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe593ec3af9d07a2cbafc69e8d3a52e2c43e76b --- /dev/null +++ b/paddle/fluid/operators/math/context_project.h @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +/* + * \brief Context projection concatenates features in adjacent time-steps in + * a sequence. The i-th row of the output is the concatenation of + * context_length rows of the input. The context_length rows are the + * consecutive rows from the i+shift_start row. + * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. + * + * \param in Input data. + * \param Shape The shape of Input data: + * [mini-batch, input_hidden_size]. + * + * \param padding_data Padding data. + * \param Shape The shape of Padding data: + * [up_pad + down_pad, input_hidden_size]. + * + * \param col Col data. + * \param Shape The shape of Col data: + * [mini-batch, context_length * input_hidden_size]. 
+ * + * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 + * time-steps: + * + * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, + * 4]. + * Besides, for the sake of simplicity, we assume M=1 and N=2. + * + * X = [[a1, a2; + * b1, b2; + * c1, c2] + * [d1, d2]] + * + * This is to say that input (X) has 4 words and the dimension of each word + * representation is 2. + * + * - Case1: + * If context_start is -1 and padding_trainable is false, we use zero to pad + * instead of learned weight to pad, + * and the context_length is 3, the output (Out) is: + * + * Out =[[0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, 0, 0 ] + * [0, 0, d1, d2, 0, 0 ]] + * + * - Case2: + * If context_start is -1 and padding_trainable is true, we use learned weight + * to pad, + * and the context_length is 3, the output (Out) is: + * + * Out = [[w1, w2, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, w3, w4] + * [w1, w2, d1, d2, w3, w4]] + * + */ + +template +class ContextProjectFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + const Tensor& padding_data, bool padding_trainable, + const int context_start, const int context_length, + const int context_stride, const int up_pad, + const int down_pad, Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Im2ColFunctor im2col_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + im2col_ocf(context, in_t, dilation, stride, padding, &out_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + // add up trainable data + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { // add up pad + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? 
context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data.Slice(k, k + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + if (down_pad > 0) { // add down pad + int down_pad_begin_row = + std::max(0, + (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data.Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } +}; + +template +class ContextProjectGradFunctor { + public: + void operator()(const DeviceContext& context, const LoDTensor& in, + bool padding_trainable, const int context_start, + const int context_length, const int context_stride, + const int up_pad, const int down_pad, bool pad_grad, + bool input_grad, Tensor* padding_data, Tensor* col) { + auto lod_level_0 = in.lod()[0]; + + math::Col2ImFunctor col2im_ocf; + + std::vector dilation({1, 1}); + std::vector padding({up_pad, 0, down_pad, 0}); + std::vector stride({context_stride, 1}); + + int input_row_begin, input_row_end; + int sequence_height, sequence_width; + sequence_width = in.dims()[1]; + + if (input_grad) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + input_row_begin = (context_start > 0) + ? 
static_cast(lod_level_0[i]) + context_start + : static_cast(lod_level_0[i]); + input_row_end = static_cast(lod_level_0[i + 1]); + + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + + if (input_row_begin < input_row_end) { + Tensor in_t = in.Slice(input_row_begin, input_row_end); + + std::vector output_shape( + {sequence_height, 1, 1, context_length, + sequence_width}); // output_height, output_width, + // input_channels, filter_height, filter_width + out_t.Resize(framework::make_ddim(output_shape)); + + std::vector input_shape( + {1, input_row_end - input_row_begin, + sequence_width}); // input_channels, input_height, input_width + in_t.Resize(framework::make_ddim(input_shape)); + + col2im_ocf(context, out_t, dilation, stride, padding, &in_t); + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + if (pad_grad) { + if (padding_trainable) { + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + + sequence_height = static_cast(out_t.dims()[0]); + out_t.Resize({sequence_height * context_length, sequence_width}); + + if (up_pad > 0) { + int padding_rows = std::min( + up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); + + for (int k = 0; k < padding_rows; ++k) { + int padding_size = + k + context_length < up_pad ? context_length : up_pad - k; + Tensor out_t_sub = out_t.Slice(k * context_length, + k * context_length + padding_size); + Tensor w_sub = padding_data->Slice(k, k + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + if (down_pad > 0) { + int down_pad_begin_row = + std::max( + 0, (sequence_height - context_start - context_length) + 1) + + 1; + int padding_begin = std::max(0, context_start - sequence_height); + int padding_size = + sequence_height - context_start >= context_length + ? 1 + : context_length - (sequence_height - context_start); + if (context_start >= sequence_height) padding_size = context_length; + int padding_idx = padding_begin; + for (int t = 0; t + down_pad_begin_row <= sequence_height; + ++t, ++padding_size) { + if (context_start >= sequence_height) + padding_size = context_length; + if (padding_size > context_length) { + padding_size = context_length; + padding_idx++; + } + if (padding_begin > 0 || sequence_height == context_start) + padding_idx = padding_begin + t; + + Tensor out_t_sub = out_t.Slice( + (down_pad_begin_row + t) * context_length - padding_size, + (down_pad_begin_row + t) * context_length); + Tensor w_sub = padding_data->Slice( + up_pad + padding_idx, up_pad + padding_idx + padding_size); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); + } + } + out_t.Resize({sequence_height, context_length * sequence_width}); + } + } + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..701a9c23c0da3afbb643e9a821b7b74e69170710 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimDyFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..0323680870ad835afca5a896f80d3abde0aad11c --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); + } + } +} + +template +struct CosSimDyFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid(1, (rows + block_size - 1) / block_size); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } +}; + +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..445d94f975f3448cc09c21be8e0a13d73d002382 --- /dev/null +++ b/paddle/fluid/operators/math/cos_sim_functor.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be wrote in a better way. 
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc new file mode 100644 index 0000000000000000000000000000000000000000..76abd03ff8b75e595461f41301c41ffe57d78686 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel) { + const int batch_size = prob->dims()[0]; + if (softLabel) { + auto in = EigenMatrix::From(*prob); + auto lbl = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*out); + + loss.device(*ctx.eigen_device()) = + -((lbl * in.log().unaryExpr(math::TolerableValue())) + .sum(Eigen::DSizes(1)) + .reshape(Eigen::DSizes(batch_size, 1))); + } else { + const int class_num = prob->dims()[1]; + const T* prob_data = prob->data(); + T* loss_data = out->data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + } + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu new file mode 100644 index 0000000000000000000000000000000000000000..39222c484c2fe847aec70b65d3d01745b8eea336 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace paddle { +namespace operators { +namespace math { + +namespace { +template +__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, + const int N, const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < D); + Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + } +} + +template +__device__ __forceinline__ T sum_single_warp(T val) { + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + return val; +} + +// CUDA do not support dynamic arrary in template +// https://stackoverflow.com/questions/20497209 +template +struct SharedMemory { + // Ensure that we won't compile any un-specialized types + __device__ T* GetPointer() { return NULL; } +}; + +template <> +struct SharedMemory { + __device__ float* GetPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory { + __device__ double* GetPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +template +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { + int tid = threadIdx.x; + SharedMemory d_sum_shared; + T* d_sum = d_sum_shared.GetPointer(); + d_sum[tid] = 0; + + int cur_idx = tid; + int next_idx = blockIdx.x * class_num + tid; + while (cur_idx < class_num) { + d_sum[tid] += + math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; + next_idx += blockDim.x; + cur_idx += blockDim.x; + } + __syncthreads(); + + for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { + if (tid < stride) d_sum[tid] += d_sum[tid + stride]; + __syncthreads(); + } + + T val = d_sum[tid]; + val = sum_single_warp(val); + if (tid == 0) Y[blockIdx.x] = -val; +} +} // namespace + +using Tensor = framework::Tensor; + +template +class CrossEntropyFunctor { + public: + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, + const framework::Tensor* labels, bool softLabel) { + const T* prob_data = prob->data(); + T* loss_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = prob->dims()[0]; + int class_num = prob->dims()[1]; + + if (softLabel) { + const T* label_data = labels->data(); + int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel<<< + batch_size, block, block * sizeof(T), + reinterpret_cast(ctx).stream()>>>( + loss_data, prob_data, label_data, class_num); + } else { + const int64_t* label_data = labels->data(); + int block = 512; + int grid = (batch_size + block - 1) / block; + CrossEntropyKernel<<>>( + loss_data, prob_data, label_data, batch_size, class_num); + } + } +}; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe216a805383ae0d7e8d008af2838652fcf87c6 --- /dev/null +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class CrossEntropyFunctor { + public: + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..7b75e593071eaeb72bcfc687b6ff22b7cf4f143f --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +// A Cuda kernel to compute the depthwise convolution forward pass +// in NCHW format. 
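+// Each thread computes one output element: the flattened index is decomposed
+// into (batch, c_out, h_out, w_out), the corresponding input channel is
+// c_out / filter_multiplier, and the thread accumulates over the filter
+// window clipped to the input boundaries.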
+template +__global__ void KernelDepthwiseConv( + const int nthreads, const T* const input_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T* const output_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / output_channels / output_height / output_width; + const int c_out = (index / output_height / output_width) % output_channels; + const int h_out = (index / output_width) % output_height; + const int w_out = index % output_width; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height; + const int w_in_end = w_in_start + filter_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + value += + weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * + input_data[offset]; + } + } + output_data[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void KernelDepthwiseConvInputGrad( + const int nthreads, const T* const output_grad_data, + const T* const filter_data, const int batch_size, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const input_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / input_channels / input_height / input_width; + const int c_in = (index / input_height / input_width) % input_channels; + const int h_in = (index / input_width) % input_height; + const int w_in = index % input_width; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + (h_in - filter_height + padding_height + stride_height) / stride_height; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + + int h_out_end = (h_in + padding_height) / stride_height; + h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; + + int w_out_start = + (w_in - filter_width + padding_width + stride_width) / stride_width; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + + int w_out_end = (w_in + padding_width) / stride_width; + w_out_end = output_width - 1 < w_out_end ? 
output_width - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + padding_height - h_out * stride_height; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + padding_width - w_out * stride_width; + const int filter_offset = c_out * filter_height * filter_width + + filter_h * filter_width + filter_w; + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += + output_grad_data[output_grad_offset] * filter_data[filter_offset]; + } + } + } + input_grad_data[index] += value; + } +} + +// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +template +__global__ void KernelDepthwiseConvFilterGrad( + const int nthreads, const T* const output_grad_data, + const T* const input_data, const int num, const int output_channels, + const int output_height, const int output_width, const int input_channels, + const int input_height, const int input_width, const int filter_multiplier, + const int filter_height, const int filter_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* const filter_grad_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int w_out = index % output_width; + const int h_out = (index / output_width) % output_height; + const int c_out = (index / output_width / output_height) % output_channels; + const int batch = (index / output_width / output_height / output_channels); + const int c_in = c_out / filter_multiplier; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = + -padding_height + h_out * stride_height + filter_height; + const int w_in_end = -padding_width + w_out * stride_width + filter_width; + const int in_offset = + (batch * input_channels + c_in) * input_height * input_width; + + T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + + for (int h_in = h_start; h_in < h_end; h_in++) { + for (int w_in = w_start; w_in < w_end; w_in++) { + const int offset = in_offset + h_in * input_width + w_in; + const T diff_temp = output_grad_data[index] * input_data[offset]; + T* addr = addr_offset + (h_in - h_in_start) * filter_width + + (w_in - w_in_start); + paddle::platform::CudaAtomicAdd(addr, diff_temp); + } + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
+ */ +template +class DepthwiseConvFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* filter_data = filter.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConv<<>>( + nthreads, input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + output_data); + } +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter.dims()[2]; + const int ksize_width = filter.dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* filter_data = filter.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvInputGrad<<>>( + nthreads, output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = 
output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = filter_grad->dims()[2]; + const int ksize_width = filter_grad->dims()[3]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_grad_data = output_grad.data(); + T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelDepthwiseConvFilterGrad<<>>( + nthreads, output_grad_data, input_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + output_channels / input_channels, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + filter_grad_data); + } +}; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..c3081e7a0deb4afc47d826753ecb2556aa6f4522 --- /dev/null +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
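In the functors above, each CUDA thread produces one output element and the launches use 1024-thread blocks with (nthreads + 1024 - 1) / 1024 blocks. For reference, the same NCHW indexing as KernelDepthwiseConv written as a plain CPU loop — a sketch only, not part of the patch, with an illustrative function name:

#include <vector>

// CPU reference for the depthwise forward kernel (NCHW, no dilation).
// Output channel c_out reads input channel c_in = c_out / filter_multiplier
// and its own filter slice of size filter_h * filter_w.
void DepthwiseConvCPU(const std::vector<float>& input,   // N*C_in*H*W
                      const std::vector<float>& filter,  // C_out*filter_h*filter_w
                      std::vector<float>* output,        // N*C_out*out_h*out_w
                      int batch, int c_in_total, int in_h, int in_w,
                      int c_out_total, int out_h, int out_w,
                      int filter_h, int filter_w,
                      int stride_h, int stride_w, int pad_h, int pad_w) {
  const int filter_multiplier = c_out_total / c_in_total;
  for (int n = 0; n < batch; ++n) {
    for (int c_out = 0; c_out < c_out_total; ++c_out) {
      const int c_in = c_out / filter_multiplier;
      for (int h_out = 0; h_out < out_h; ++h_out) {
        for (int w_out = 0; w_out < out_w; ++w_out) {
          float value = 0.f;
          for (int kh = 0; kh < filter_h; ++kh) {
            for (int kw = 0; kw < filter_w; ++kw) {
              const int h_in = h_out * stride_h - pad_h + kh;
              const int w_in = w_out * stride_w - pad_w + kw;
              if (h_in < 0 || h_in >= in_h || w_in < 0 || w_in >= in_w) continue;
              value += filter[(c_out * filter_h + kh) * filter_w + kw] *
                       input[((n * c_in_total + c_in) * in_h + h_in) * in_w + w_in];
            }
          }
          (*output)[((n * c_out_total + c_out) * out_h + h_out) * out_w + w_out] = value;
        }
      }
    }
  }
}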
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* filter_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/fluid/operators/math/detail/CMakeLists.txt similarity index 100% rename from paddle/operators/math/detail/CMakeLists.txt rename to paddle/fluid/operators/math/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..3af7ba790c489b2fc34b3cb6d56849ce789d2430 --- /dev/null +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + PADDLE_THROW("Not support type %s.", type); +} + +namespace forward { + +template +DEVICE T Identity(const T a) { + return a; +} + +template +DEVICE T Relu(const T a) { + return a > static_cast(0.0) ? 
a : static_cast(0.0); +} + +template +DEVICE T Sigmoid(const T a) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +template +DEVICE T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +} // namespace forward + +namespace backward { + +template +DEVICE T Identity(const T a, const T b) { + return a; +} + +template +DEVICE T Relu(const T a, const T b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +template +DEVICE T Sigmoid(const T a, const T b) { + return a * b * (1.0 - b); +} + +template +DEVICE T Tanh(const T a, const T b) { + return a * (1.0 - b * b); +} + +} // namespace backward + +template +struct Active { + typedef T (*Act)(T); + typedef T (*ActGrad)(T, T); +}; + +static DEVICE Active::Act kActFloat[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradFloat[] = { + &backward::Sigmoid, &backward::Relu, &backward::Tanh, + &backward::Identity}; + +static DEVICE Active::Act kActDouble[] = { + &forward::Sigmoid, &forward::Relu, &forward::Tanh, + &forward::Identity}; + +static DEVICE Active::ActGrad kActGradDouble[] = { + &backward::Sigmoid, &backward::Relu, + &backward::Tanh, &backward::Identity}; + +namespace forward { +inline DEVICE float activation(float a, int index) { + return kActFloat[index](a); +} + +inline DEVICE double activation(double a, int index) { + return kActDouble[index](a); +} + +} // namespace forward + +namespace backward { +inline DEVICE float activation(float a, float b, int index) { + return kActGradFloat[index](a, b); +} + +inline DEVICE double activation(double a, double b, int index) { + return kActGradDouble[index](a, b); +} +} // namespace backward + +#ifdef __AVX__ +namespace forward { +namespace avx { +__m256 Relu(const __m256 a); +__m256 Sigmoid(const __m256 a); +__m256 Tanh(const __m256 a); +__m256 Identity(const __m256 a); +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b); +__m256 Sigmoid(const __m256 a, const __m256 b); +__m256 Tanh(const __m256 a, const __m256 b); +__m256 Identity(const __m256 a, const __m256 b); +} // namespace avx +} // namespace backward + +static Active<__m256>::Act kActAvx[] = { + &forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh, + &forward::avx::Identity}; + +static Active<__m256>::ActGrad kActGradAvx[] = { + &backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh, + &backward::avx::Identity}; + +namespace forward { +inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } +} // namespace forward + +namespace backward { +inline __m256 activation(__m256 a, __m256 b, int index) { + return kActGradAvx[index](a, b); +} +} // namespace backward + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc new file mode 100644 index 0000000000000000000000000000000000000000..838cd30e3d503ddb4734f1114e741fd40b1939c0 --- /dev/null +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
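activation_functions.h dispatches through arrays of function pointers indexed by ActivationType, so the GRU/LSTM kernels select sigmoid/relu/tanh/identity at run time without string comparisons in the hot loop. A standalone sketch of that table-dispatch pattern (names and table here are illustrative, not the fluid ones):

#include <cmath>
#include <cstdio>

// Table dispatch in the style of kActFloat / forward::activation(a, index):
// the enum value doubles as an index into an array of function pointers.
enum ActivationType { kSigmoid = 0, kReLU, kTanh, kIdentity };

float Sigmoid(float a) { return 1.0f / (1.0f + std::exp(-a)); }
float Relu(float a) { return a > 0.0f ? a : 0.0f; }
float Tanh(float a) { return std::tanh(a); }
float Identity(float a) { return a; }

using Act = float (*)(float);
static const Act kActTable[] = {&Sigmoid, &Relu, &Tanh, &Identity};

inline float activation(float a, int index) { return kActTable[index](a); }

int main() {
  std::printf("%f %f\n", activation(-1.0f, kReLU), activation(0.0f, kSigmoid));
  return 0;
}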
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef __AVX__ + +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +// TODO(qingqing) refine this dependence +#include "paddle/cuda/src/avx_mathfun.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +__m256 Exp(__m256 a) { return exp256_ps(a); } + +namespace forward { +namespace avx { +__m256 Relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} + +__m256 Sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = Exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} + +__m256 Tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = Exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} + +__m256 Identity(const __m256 a) { return a; } + +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 Relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); +} + +__m256 Sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} + +__m256 Tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} + +__m256 Identity(const __m256 a, const __m256 b) { return a; } +} // namespace avx +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..75c5c8eb29a34047a22779edcc0fc2f5fbcbab6f --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -0,0 +1,426 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
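avx_functions.cc above implements the same four activations eight floats at a time with 256-bit intrinsics, clamping inputs to SIGMOID_THRESHOLD_* / EXP_MAX_INPUT before calling exp256_ps. A minimal sketch of that load / vector-op / store pattern, assuming an AVX-capable build (e.g. compiled with -mavx):

#include <immintrin.h>
#include <cstdio>

// Eight ReLUs at once, the same shape as forward::avx::Relu above:
// broadcast 0.0f, take the element-wise max, write the eight lanes back.
int main() {
  alignas(32) float in[8] = {-3.f, -1.f, -0.5f, 0.f, 0.5f, 1.f, 2.f, 3.f};
  alignas(32) float out[8];
  __m256 a = _mm256_load_ps(in);
  __m256 zero = _mm256_set1_ps(0.0f);
  _mm256_store_ps(out, _mm256_max_ps(a, zero));
  for (int i = 0; i < 8; ++i) std::printf("%g ", out[i]);
  std::printf("\n");
  return 0;
}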
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; + } +} + +template +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; + } +} + +template +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; + } +#endif +} + +template +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; + } +#endif +} + +template 
+inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } else { + hl_naive_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); + } else { + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; + } + if (prev_out_value) { + r_prev_out_value = 
prev_out_value[i]; + } + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; + } + } +} + +template +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, + ActivationType active_node) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, + ActivationType active_gate) { +#ifdef __AVX__ + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; + } + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; + } + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; + } + } +#endif +} + +template +inline void backward_state_grad(OpStateGrad op_state_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node) { + for (int b = 0; b < batch_size; 
b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } else { + hl_naive_gru_backward_state_grad( + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +template +inline void backward_reset_grad(OpResetGrad op_reset_grad, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } else { + hl_naive_gru_backward_reset_grad( + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + } + + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; + } + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fbf69d4a85883a68f137945ed8978acb9108b77b --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
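In the CPU kernels above, the per-frame gate buffer is laid out as three contiguous slices of frame_size values — [update gate | reset gate | frame state] — and the batched wrappers advance gate_value by 3 * frame_size per sample. Combined with the element-wise ops defined in gru_kernel.h further below, one frame of the forward pass reduces to the following scalar sketch (the GEMM stages that fill the pre-activations are assumed to have run already; values are made up):

#include <cmath>
#include <cstdio>

// One GRU frame, element-wise part only (pre-activations already hold the
// input and recurrent GEMM results). Mirrors gru_resetOutput / gru_finalOutput.
float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
  float gate[3] = {0.2f, -0.4f, 0.7f};  // [update, reset, frame-state] pre-activations
  float h_prev = 0.5f;

  // Reset-output stage: activate the two gates, gate the previous hidden state.
  float u = Sigmoid(gate[0]);
  float r = Sigmoid(gate[1]);
  float reset_output = r * h_prev;      // fed into the candidate GEMM elsewhere

  // Final-output stage: activate the candidate, interpolate with h_prev.
  float c = std::tanh(gate[2]);
  float h = h_prev - u * h_prev + u * c;  // (1 - u) * h_prev + u * c

  std::printf("reset_output=%f  h=%f\n", reset_output, h);
  return 0;
}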
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); + + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; + } + + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); + + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, + ActivationType active_node) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; + } + + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = 
gate_value[frame_idx + frame_size * 0]; + T r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; + } + + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, + ActivationType active_gate) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; + } + + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + r_reset_output_grad = reset_output_grad[frame_idx]; + } + + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); + + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; + } +} +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..705787e2ff76630fbcb23e646c91f74fa2feea24 --- /dev/null +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -0,0 +1,163 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
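The GPU GRU kernels above use a two-dimensional launch: threads cover frames along x and batch samples along y, and the is_batch template flag strips the batch arithmetic when batch_size == 1. A host-side sketch of the ceil-divided grid sizing this implies (the LSTM launch helpers later in this patch use 32x32 and 32x16 blocks; exact shapes vary per kernel):

#include <cstdio>

// Host-side sketch of the frame x batch launch geometry: fixed thread-block
// shape, ceil-divided block counts along both axes.
struct Dim3 { int x, y; };

Dim3 GridFor(int frame_size, int batch_size, int tx = 32, int ty = 32) {
  return {(frame_size + tx - 1) / tx, (batch_size + ty - 1) / ty};
}

int main() {
  int frame_size = 100, batch_size = 17;
  Dim3 grid = GridFor(frame_size, batch_size);
  std::printf("grid = (%d, %d)\n", grid.x, grid.y);  // (4, 1)
  // Inside the kernel: frame_idx = blockIdx.x * blockDim.x + threadIdx.x
  //                    batch_idx = blockIdx.y * blockDim.y + threadIdx.y
  // and per-sample pointers advance by batch_idx * 3 * frame_size.
  return 0;
}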
*/ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +// TODO(guosheng): refine code style in gru_kernel +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class gru_resetOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + ActivationType act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); + } +#endif +#endif +}; + +template +class gru_finalOutput { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, ActivationType act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); + } +#endif +#endif +}; +} // namespace forward + +namespace backward { + +template +class gru_stateGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, ActivationType act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + ActivationType act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); + } +#endif +#endif +}; + +template +class gru_resetGrad { + public: + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T 
&value_prev_out, T &grad_prev_out, + T &grad_reset_output, ActivationType act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#ifndef __NVCC__ +#ifndef __AVX__ + static const bool avx = false; +#else + static const bool avx = true; + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + ActivationType act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bf26509ba17774f55c4f7592ff3afb7bcfcaa336 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -0,0 +1,312 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#ifndef __NVCC__ + +template +void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; + } +} + +template +void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; + } +} + +template +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_state; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = 
value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + ((__m256 *)value.state_value)[i] = r_state; + ((__m256 *)value.state_active_value)[i] = r_state_atv; + ((__m256 *)value.output_value)[i] = r_out; + } +#endif +} + +template +void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { +#ifdef __AVX__ + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + __m256 *grad_in = (__m256 *)grad.gate_grad; + __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); + __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); + __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; + } + r_state = ((__m256 *)value.state_value)[i]; + r_state_atv = ((__m256 *)value.state_active_value)[i]; + r_output_grad = ((__m256 *)grad.output_grad)[i]; + r_state_grad = ((__m256 *)grad.state_grad)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + ((__m256 *)grad.state_grad)[i] = r_state_grad; + + if (grad.prev_state_grad) + ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; + if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; + } + if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; + } +#endif +} + +template +void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, 
+ ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_forward_one_sequence(op, value, frame_size, active_node, + active_gate, active_state); + } +} + +template +void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, + active_gate, active_state); + } else { + naive_lstm_backward_one_sequence(op, value, grad, frame_size, + active_node, active_gate, active_state); + } +} + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7865d0c0ba12c6150d87ea22e9c597e90b57e1ba --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device_context.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.output_value += batch_idx * frame_size; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + } + + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, + active_state); + + value.gate_value[frame_idx] = r_value_in; + value.gate_value[frame_idx + frame_size] = r_value_ig; + value.gate_value[frame_idx + frame_size * 2] = r_value_fg; + value.gate_value[frame_idx + frame_size * 3] = r_value_og; + + value.state_value[frame_idx] = r_state; + value.state_active_value[frame_idx] = r_state_atv; + value.output_value[frame_idx] = r_out; +} + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void KeLstmBackward(Op op, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + grad.gate_grad += batch_idx * frame_size * 4; + grad.state_grad += batch_idx * frame_size; + grad.output_grad += batch_idx * frame_size; + } + + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + r_state = value.state_value[frame_idx]; + r_state_atv = value.state_active_value[frame_idx]; + r_output_grad = grad.output_grad[frame_idx]; + r_state_grad = grad.state_grad[frame_idx]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; + } + + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad.gate_grad[frame_idx] = r_grad_in; + grad.gate_grad[frame_idx + frame_size] = r_grad_ig; + grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg; + grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og; + grad.state_grad[frame_idx] = r_state_grad; + if (grad.prev_state_grad) { + if (is_batch) grad.prev_state_grad += batch_idx * frame_size; + grad.prev_state_grad[frame_idx] = r_prev_state_grad; + } + + if (is_batch) { + if (value.prev_state_value) { + if (grad.check_ig_grad) + paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, + r_checkIGrad); + if (grad.check_fg_grad) + paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, + r_checkFGrad); + } + if (grad.check_og_grad) + paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, + r_checkOGrad); + } else { + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad; + } + if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad; + } +} + +template +void gpu_lstm_forward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 32 */ + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmForward<<>>( + op, value, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +template +void gpu_lstm_backward(const platform::DeviceContext& context, Op op, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + /* frame_per_block = 32 batch_per_block = 16 */ + threads = dim3(32, 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); + } + + auto stream = + reinterpret_cast(context).stream(); + if (batch_size == 1) { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } else { + KeLstmBackward<<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, + active_state); + } +} + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0679cc62ba91fce540d9f6e227729a96a1553173 --- /dev/null +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +namespace forward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &prev_state, T &state, T &state_atv, T &output, + T &checkI, T &checkF, T &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = activation(value_ig + prev_state * checkI, active_gate); + value_fg = activation(value_fg + prev_state * checkF, active_gate); + state = value_in * value_ig + prev_state * value_fg; + value_og = activation(value_og + state * checkO, active_gate); + state_atv = activation(state, active_state); + output = value_og * state_atv; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + + HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, + __m256 &value_fg, __m256 &value_og, + __m256 &prev_state, __m256 &state, + __m256 &state_atv, __m256 &output, __m256 &checkI, + __m256 &checkF, __m256 &checkO, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + value_in = activation(value_in, active_node); + value_ig = + activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), + active_gate); + value_fg = + activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), + active_gate); + state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), + _mm256_mul_ps(prev_state, value_fg)); + value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), + active_gate); + state_atv = activation(state, active_state); + output = _mm256_mul_ps(value_og, state_atv); + } +#endif +#endif +}; + +} // namespace forward + +namespace backward { + +template +class lstm { + public: + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, + T &prev_state, T &prev_state_grad, T &state, + T &state_grad, T &state_atv, T &output_grad, + T &checkI, T &checkF, T &checkO, T &checkIGrad, + T &checkFGrad, T &checkOGrad, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + grad_og = activation(output_grad * state_atv, value_og, active_gate); + state_grad += activation(output_grad * value_og, state_atv, active_state) + + grad_og * checkO; + grad_in = activation(state_grad * value_ig, value_in, active_node); + grad_ig = activation(state_grad * value_in, value_ig, active_gate); + grad_fg = activation(state_grad * prev_state, value_fg, active_gate); + prev_state_grad = + grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; + checkIGrad = grad_ig * prev_state; + checkFGrad = grad_fg * prev_state; + checkOGrad = grad_og * state; + } +#ifndef __NVCC__ +#ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default + static const bool avx = false; +#else + // Only float support AVX optimization + static const bool avx = std::is_same::value; + HOSTDEVICE void operator()( + __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, + __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, + __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, + __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, + __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { + grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, + active_gate); + state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), + state_atv, active_state), + state_grad); + state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); + grad_in = + activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); + grad_ig = + activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); + grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, + active_gate); + prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), + _mm256_mul_ps(grad_fg, checkF)); + prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); + checkIGrad = _mm256_mul_ps(grad_ig, prev_state); + checkFGrad = _mm256_mul_ps(grad_fg, prev_state); + checkOGrad = _mm256_mul_ps(grad_og, state); + } +#endif +#endif +}; + +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h new file mode 100644 index 0000000000000000000000000000000000000000..13e5d406c11a10dc87533a2ca07d14f4684446f5 --- /dev/null +++ b/paddle/fluid/operators/math/detection_util.h @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. 
object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +// template +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); +template +bool SortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices); +template +BBox ClipBBox(const BBox& bbox); +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data); +template +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = bbox; + } +} +template +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + var.push_back(*(prior_data + i * 8 + 4)); + var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} 
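The DecodeBBoxWithVar routine above follows the usual SSD-style center/size decoding: the predicted offsets are scaled by the prior-box variances, the center is shifted relative to the prior box, and the width/height are scaled exponentially before converting back to corner form. A minimal standalone sketch of the same arithmetic follows; the prior box, variance, and location-prediction values are hypothetical, chosen only for illustration, and are not taken from the patch.

    #include <cmath>
    #include <cstdio>

    int main() {
      // Hypothetical prior box in normalized corner form [x_min, y_min, x_max, y_max].
      float prior[4] = {0.2f, 0.2f, 0.6f, 0.6f};
      // Hypothetical per-coordinate variances and predicted location offsets.
      float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
      float loc[4] = {0.5f, -0.5f, 0.1f, 0.2f};

      float prior_w = prior[2] - prior[0], prior_h = prior[3] - prior[1];
      float prior_cx = (prior[0] + prior[2]) / 2, prior_cy = (prior[1] + prior[3]) / 2;

      // Same arithmetic as DecodeBBoxWithVar: variance-scaled center shift and
      // exponential width/height scaling around the prior box.
      float cx = var[0] * loc[0] * prior_w + prior_cx;
      float cy = var[1] * loc[1] * prior_h + prior_cy;
      float w = std::exp(var[2] * loc[2]) * prior_w;
      float h = std::exp(var[3] * loc[3]) * prior_h;

      // Convert back to corner coordinates.
      std::printf("decoded box: [%f, %f, %f, %f]\n",
                  cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2);
      return 0;
    }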
+template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + SortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int GetDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num = 0; + for (size_t n = 0; n < batch_size; ++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for (size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += 
top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox ClipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void GetDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for (std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..100318041679e38b37fd1ef1f071d4e682889756 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + detail::forward_reset_output(detail::forward::gru_resetOutput(), value, + frame_size, batch_size, active_gate); + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + detail::forward_final_output(detail::forward::gru_finalOutput(), value, + frame_size, batch_size, active_node); +#endif + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { +#ifndef __NVCC__ + detail::backward_state_grad(detail::backward::gru_stateGrad(), value, + grad, frame_size, batch_size, active_node); + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, + grad, frame_size, batch_size, active_gate); + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } +#endif + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d5d5d7a743150c397a5b8356d1b3add88c509b6 --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardResetOutput, + /* is_batch= */ false, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } else { + detail::KeGruForwardResetOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); + } + + if (value.prev_out_value) { + math::gemm( + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardFinalOutput, + /* is_batch= */ false, + T><<>>( + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } else { + detail::KeGruForwardFinalOutput, + /* is_batch= */ true, + T><<>>( + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, + active_node); + } + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, + GRUMetaValue value, GRUMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (batch_size == 1) { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } else { + detail::KeGruBackwardStateGrad< + detail::backward::gru_stateGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); + } + + if (value.prev_out_value && grad.prev_out_grad) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); + + if (grad.state_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); + } + } + + if (batch_size == 1) { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } else { + detail::KeGruBackwardResetGrad< + detail::backward::gru_resetGrad, + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); + } + + if (grad.prev_out_grad && value.prev_out_value) { + math::gemm( + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); + + if (grad.gate_weight_grad) { + math::gemm( + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..93e19cf55782facfd95affdc77dcf78c511d8bbd --- /dev/null +++ b/paddle/fluid/operators/math/gru_compute.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct GRUMetaValue { + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const DeviceContext &context, GRUMetaValue value, + GRUMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..c298b00bb4cb9df8f9a54b4420edb07aed9cf891 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/im2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 
static_cast(0) + : im_data[im_idx]; + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + ((dilation[0] * (filter_height - 1) + 1))) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + ((dilation[1] * (filter_width - 1) + 1))) / + stride[1] + + 1, + col_width, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + + int channels_col = im_channels * filter_height * filter_width; + + T* im_data = im->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + if ((im_row_idx) >= 0 && (im_row_idx) < im_height && + (im_col_idx) >= 0 && (im_col_idx) < im_width) { + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += + col_data[(c * col_height + h) * col_width + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + const T* im_data = im.data(); + T* col_data = col->data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < 
filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + ((((col_row_idx)*col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + int im_offset = (channel * im_height + im_row_offset) * im_width + + im_col_offset; + col_data[col_offset] = + (im_row_offset < 0 || im_row_offset >= im_height || + im_col_offset < 0 || im_col_offset >= im_width) + ? static_cast(0) + : im_data[im_offset]; + } + } + } + } + } + } +}; + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ( + (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ( + (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + T* im_data = im->data(); + const T* col_data = col.data(); + + for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { + for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { + for (int channel = 0; channel < im_channels; ++channel) { + for (int filter_row_idx = 0; filter_row_idx < filter_height; + ++filter_row_idx) { + int im_row_offset = + col_row_idx * stride[0] + filter_row_idx - padding[0]; + for (int filter_col_idx = 0; filter_col_idx < filter_width; + ++filter_col_idx) { + int im_col_offset = + col_col_idx * stride[1] + filter_col_idx - padding[1]; + + int col_offset = + (((col_row_idx * col_width + col_col_idx) * im_channels + + channel) * + filter_height + + filter_row_idx) * + filter_width + + filter_col_idx; + + if (im_row_offset >= 0 && im_row_offset < im_height && + im_col_offset >= 0 && im_col_offset < im_width) { + int im_offset = + (channel * im_height + im_row_offset) * im_width + + im_col_offset; + im_data[im_offset] += col_data[col_offset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..c26343aacf524c4381a8bba1e4e0d1a07bee6d6e --- /dev/null +++ b/paddle/fluid/operators/math/im2col.cu @@ -0,0 +1,424 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void im2col(const T* data_im, int num_outs, int im_height, + int im_width, int dilation_h, int dilation_w, + int filter_height, int filter_width, int stride_height, + int stride_width, int padding_height, int padding_width, + int col_height, int col_width, T* data_col) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < num_outs) { + int w_out = index % col_width; + int h_out = (index / col_width) % col_height; + int channel_in = index / col_width / col_height; + int channel_out = channel_in * filter_height * filter_width; + int h_in = h_out * stride_height - padding_height; + int w_in = w_out * stride_width - padding_width; + + data_col += (channel_out * col_height + h_out) * col_width + w_out; + data_im += (channel_in * im_height + h_in) * im_width + w_in; + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int rIdx = h_in + i * dilation_h; + int cIdx = w_in + j * dilation_w; + *data_col = + (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0) + ? 0 + : data_im[i * dilation_h * im_width + j * dilation_w]; + data_col += col_height * col_width; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int num_outputs = im_channels * col_height * col_width; + int blocks = (num_outputs + 1024 - 1) / 1024; + int block_x = 512; + int block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + im2col<<>>( + im.data(), num_outputs, im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[1], col_height, col_width, col->data()); + } +}; + +template +__global__ void col2im(int n, const T* data_col, int im_height, int im_width, + int dilation_h, int dilation_w, int filter_height, + int filter_width, int stride_height, int 
stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* data_im) { + const int index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + if (index < n) { + T val = 0; + int w = index % im_width + padding_width; + int h = (index / im_width) % im_height + padding_height; + int c = index / (im_width * im_height); + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, col_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, col_height); + + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (h_off % dilation_h == 0 && w_off % dilation_w == 0) { + h_off /= dilation_h; + w_off /= dilation_w; + int data_col_index = + (((c * filter_height + h_off) * filter_width + w_off) * + col_height + + h_col) * + col_width + + w_col; + + val += data_col[data_col_index]; + } + } + } + data_im[index] = val; + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, output_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[1]; + int filter_width = col.dims()[2]; + int col_height = col.dims()[3]; + int col_width = col.dims()[4]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + size_t num_kernels = im_channels * im_height * im_width; + + size_t blocks = (num_kernels + 1024 - 1) / 1024; + size_t block_x = 512; + size_t block_y = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(block_x, block_y); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
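+    // Concretely, the "bottom" here is the image: num_kernels equals
+    // im_channels * im_height * im_width, one thread is launched per image
+    // element, and each thread sums every column entry that maps onto its
+    // element, so this col -> im pass needs no atomic adds.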
+ col2im<<>>( + num_kernels, col.data(), im_height, im_width, dilation[0], + dilation[1], filter_height, filter_width, stride[0], stride[1], + padding[0], padding[2], col_height, col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +template +__global__ void im2colOCF(const T* im_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int padding_height, int padding_width, int col_height, + int col_width, T* col_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + col_data[col_offset] = + (height_offset >= im_height || height_offset < 0 || + width_offset >= im_width || width_offset < 0) + ? T(0) + : im_data[im_offset]; + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Im2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col) { + PADDLE_ENFORCE(im.dims().size() == 3); + PADDLE_ENFORCE(col->dims().size() == 5); + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[3]; + int filter_width = col->dims()[4]; + int col_height = col->dims()[0]; + int col_width = col->dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 grid(col_width, col_height); + im2colOCF<<>>( + im.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, col->data()); + } +}; + +template +__global__ void col2imOCF(const T* col_data, int im_channels, int im_height, + int im_width, int filter_height, int filter_width, + int stride_height, int stride_width, + int 
padding_height, int padding_width, int col_height, + int col_width, T* im_data) { + int swid = blockIdx.x; + int shid = blockIdx.y; + for (int channelid = threadIdx.z; channelid < im_channels; + channelid += blockDim.z) { + for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { + int width_offset = idx + swid * stride_width - padding_width; + int height_offset = idy + shid * stride_height - padding_height; + int im_offset = width_offset + height_offset * im_width + + channelid * im_height * im_width; + + int col_offset = idx + idy * filter_width + + channelid * filter_height * filter_width + + (shid * col_width + swid) * + (im_channels * filter_height * filter_width); + + if (height_offset >= 0 && height_offset < im_height && + width_offset >= 0 && width_offset < im_width) { + paddle::platform::CudaAtomicAdd(im_data + im_offset, + col_data[col_offset]); + } + } + } + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [output_height, output_width, input_channels, filter_height, filter_width] + */ +template +class Col2ImFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im) { + PADDLE_ENFORCE(im->dims().size() == 3); + PADDLE_ENFORCE(col.dims().size() == 5); + int im_channels = im->dims()[0]; + int im_height = im->dims()[1]; + int im_width = im->dims()[2]; + int filter_height = col.dims()[3]; + int filter_width = col.dims()[4]; + int col_height = col.dims()[0]; + int col_width = col.dims()[1]; + + PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - + (dilation[0] * (filter_height - 1) + 1)) / + stride[0] + + 1, + col_height, + "Output_height and padding(padding_up, padding_down) are " + "inconsistent."); + PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - + (dilation[1] * (filter_width - 1) + 1)) / + stride[1] + + 1, + col_width, + "col_width and padding(padding_left, padding_right) are " + "inconsistent."); + + int block_dim_x = 0; + int block_dim_y = 0; + if (filter_height <= 4 && filter_width <= 4) { + block_dim_x = 4; + block_dim_y = 4; + } else if (filter_height <= 8 && filter_width <= 8) { + block_dim_x = 8; + block_dim_y = 8; + } else if (filter_height <= 16 && filter_width <= 16) { + block_dim_x = 16; + block_dim_y = 16; + } else { + block_dim_x = 32; + block_dim_y = 32; + } + + int block_dim_z = 1024 / block_dim_x / block_dim_y; + dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels)); + dim3 grid(col_width, col_height); + col2imOCF<<>>( + col.data(), im_channels, im_height, im_width, filter_height, + filter_width, stride[0], stride[1], padding[0], padding[1], col_height, + col_width, im->data()); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..525c0f5dda102ef92a9d79832fecc10f99ccd900 --- /dev/null +++ b/paddle/fluid/operators/math/im2col.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum class ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [input_channels, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 2-dimension [dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 2-dimension [stride_height, stride_width]. + * + * \param paddings padding data. + * \param 4-dimension [up_pad, left_pad, down_pad, right_pad]. + * + * If the template argument Format is kCFO, the shape of colData is: + * [input_channels, filter_height, filter_width, output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_height * filter_width, and the width is equal + * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_height, + * filter_width, ======> [height, width] + * output_height, + * output_width] + * + * If the template argument Format is kOCF, the shape of colData is: + * [output_height, output_width, input_channels, filter_height, filter_width] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seq_length, step_size], where the seq_length + * is equal output_height * output_width, and the step_size is equal + * input_channels * filter_height * filter_width. + * + * Reshape: + * shape of colData shape of sequence matrix + * [output_height, + * output_width, + * input_channels, ======> [seqLength, stepSize] + * filter_height, + * filter_width] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. 
+ */ +template +class Im2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* col); +}; + +template +class Col2ImFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, framework::Tensor* im); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..59d6a84b892fc32aceb5622a856a5c648a3ade5b --- /dev/null +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/im2col.h" +#include + +template +void testIm2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + /** + * input = [0, 1, 2, + * 3, 4, 5] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5] + * + * output_ocf = [0, 1, 3, 4 + * 1, 2, 4, 5] + * + * col2im_cfo = [0, 2, 2 + * 3, 4, 5] + * + * col2im_ocf = [0, 2, 2 + * 3, 4, 5] + */ + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding( + {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; + float* input_ptr = input_tmp.mutable_data( + {1, input_height, input_width}, paddle::platform::CPUPlace()); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input_ptr, arr, 6 * sizeof(float)); + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output_cfo.mutable_data( + {1, filter_size, filter_size, output_height, output_width}, *place); + output_ocf.mutable_data( + {output_height, output_width, 1, filter_size, filter_size}, *place); + + // Im2Col + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + im2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + im2col_ocf; + + im2col(*context, input, dilation, stride, padding, &output_cfo); + im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); + + float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; + float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; + + float* out_cfo_ptr; + 
if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); + } + + float* out_ocf_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_ocf_ptr = output_ocf.data(); + } else { + Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + out_ocf_ptr = output_tmp.data(); + } + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); + } + + // Col2Im: kCFO + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> + col2im; + paddle::operators::math::Col2ImFunctor< + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> + col2im_ocf; + float col2im_data[] = {0, 2, 2, 3, 8, 5}; + + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im(*context, output_cfo, dilation, stride, padding, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + // Col2Im: kOCF + memset(input_ptr, 0, 6 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); + + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(in_ptr[i], col2im_data[i]); + } + + delete place; + delete context; +} + +TEST(math, im2col) { + testIm2col(); +#ifdef PADDLE_WITH_CUDA + testIm2col(); +#endif +} diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..09eb89ec58d107f547a5c83908a9d2a541aa95f4 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, + cand_act, gate_act, cell_act); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, + frame_size, cand_act, gate_act, cell_act); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..adedee28bd010c611b9c6901c55bf67e73d3639b --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" +#include "paddle/fluid/operators/math/detail/lstm_kernel.h" +#include "paddle/fluid/operators/math/lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_forward(context, detail::forward::lstm(), value, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, + LstmMetaValue value, LstmMetaGrad grad, + int frame_size, int batch_size, + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { + detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, + frame_size, batch_size, cand_act, gate_act, + cell_act); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/fluid/operators/math/lstm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8610e96cf1abb0e5a64cd60c6bab3a8c08754587 --- /dev/null +++ b/paddle/fluid/operators/math/lstm_compute.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct LstmMetaValue { + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; +}; + +template +struct LstmMetaGrad { + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; +}; + +template +class LstmUnitFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +template +class LstmUnitGradFunctor { + public: + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc new file mode 100644 index 0000000000000000000000000000000000000000..2636dbddde67955a99663aa93df1425b9e1ec2ce --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cc @@ -0,0 +1,342 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +#ifdef PADDLE_WITH_MKLML +// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. 
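+//
+// (Added clarification, not in the original source: MKL's grouped batch API
+// takes one parameter entry per group plus a group_size array. Calling it
+// below with group_count = 1 and a single group of size batchCount means all
+// batchCount multiplications share the same transA/transB, M/N/K, alpha/beta
+// and leading dimensions; only the per-matrix A/B/C pointers gathered in
+// a_array/b_array/c_array differ.)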
+template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} +#else +// The below is a naive but correct serial implementation that just loops +// over the batch dimension. This is a fallback for when the batched gemm +// functions of Intel MKL are not available. In the future, this computation +// should be parallelized. +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const float* Ak = &A[k * strideA]; + const float* Bk = &B[k * strideB]; + float* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} + +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const double* Ak = &A[k * strideA]; + const double* Bk = &B[k * strideB]; + double* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); + } +} +#endif + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; + cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void operator()() const { + auto cpu = platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#ifdef PADDLE_WITH_CUDA + tensor->place().apply_visitor(func); +#else + func(platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff 
--git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu new file mode 100644 index 0000000000000000000000000000000000000000..5764da71c8491e27493774569bb663f6a6e835c3 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.cu @@ -0,0 +1,355 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/math_function_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { + cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { + cublasOperation_t cuTransA = (trans_a == false) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template <> +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); +} + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); + +struct TensorSetConstantGPU { + TensorSetConstantGPU(const platform::DeviceContext& context, + framework::Tensor* tensor, float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()() const { + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); + } + + const platform::DeviceContext& context_; + framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstantGPU(context, tensor, value)); +} + +template +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, + int num) { + T tmp = 1.0 / width; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), vector.data(), output->data(), + static_cast(in_dims[1]), static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); +} + +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. 
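+//
+// (Added note, not in the original source: like the ColwiseSum specialization
+// above, the specialization below re-expresses the reduction as a gemv
+// involving a constant vector of ones, so the summation runs through cuBLAS
+// rather than Eigen's general reduction.)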
+template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h new file mode 100644 index 0000000000000000000000000000000000000000..84916af1f8e5bd55920685b4897858f2da9fde92 --- /dev/null +++ b/paddle/fluid/operators/math/math_function.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MKLML +#include +#include +#include +#endif + +#ifdef PADDLE_USE_ATLAS +extern "C" { +#include +#include +} +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#include +#endif + +#ifndef LAPACK_FOUND +extern "C" { +#include +int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, + int* ipiv); +int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, + int* ipiv); +int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, + const int* ipiv); +int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, + const int* ipiv); +} +#endif + +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +// Support continuous memory now +// If transA = N, and transB = N +// Then matrixA: M * K, matrixB: K * N, matrixC : M * N +// For more detailed info, please refer to +// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const T alpha, const T* A, const T* B, const T beta, T* C); + +// gemm wrapper with stride args for matrix uncontinuous in memory +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); + +// matrix multiply with continuous memory +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); + +// Batched gemm +template +void 
batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template +struct Transpose { + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +void set_constant(const platform::DeviceContext& context, + framework::Tensor* tensor, float value); + +template +struct RowwiseAdd { + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); +}; + +template +struct ColwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a55ed6c58bafafed1d58ff47fb433a9ae58b8261 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { + auto t = framework::EigenVector::Flatten(*tensor); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); +} + +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis) { + Eigen::array permute; + for (int i = 0; i < Rank; i++) { + permute[i] = axis[i]; + } + auto in_dim = in.dims(); + auto out_dim = out->dims(); + + auto eigen_in = framework::EigenTensor::From(in); + auto eigen_out = framework::EigenTensor::From(*out); + auto* dev = context.eigen_device(); + eigen_out.device(*dev) = eigen_in.shuffle(permute); +} + +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); +} + +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cd8e8b35abeb6d321bb075d7b88f0abd8f0fc59 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/operators/math/math_function.h" +#include "gtest/gtest.h" + +TEST(math_function, gemm_notrans_cblas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, gemm_trans_clbas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, + input3_ptr + 1, 4); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); +} + +TEST(math_function, zero) { + paddle::framework::Tensor tensor; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* t = tensor.mutable_data({2, 2}, *cpu_place); + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::SetConstant + functor; + functor(context, &tensor, 0); + EXPECT_EQ(t[0], 0); + EXPECT_EQ(t[1], 0); + EXPECT_EQ(t[2], 0); + EXPECT_EQ(t[3], 0); + + functor(context, &tensor, 1); + + EXPECT_EQ(t[0], 1); + EXPECT_EQ(t[1], 1); + EXPECT_EQ(t[2], 1); + EXPECT_EQ(t[3], 1); +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + int b_num = trans ? m : n; + int c_num = trans ? 
n : m; + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({b_num}, *cpu_place); + T* data_c = vec_c.mutable_data({c_num}, *cpu_place); + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CPUDeviceContext context(*cpu_place); + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., data_a, + data_b, 0., data_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(4, 5, false); + GemvTest(12, 7, true); + GemvTest(7, 9, true); +} + +TEST(math_funciton, set_constant) { + paddle::framework::Tensor t; + t.Resize({10, 10}); + t.mutable_data(paddle::platform::CPUPlace()); + auto* ctx = new paddle::platform::CPUDeviceContext(); + paddle::operators::math::set_constant(*ctx, &t, 10); + for (int64_t i = 0; i < t.numel(); ++i) { + PADDLE_ENFORCE_EQ(10, t.data()[i]); + } + delete ctx; +} diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..2ef53a8209940e52390cfe52978f3b32b6dabae6 --- /dev/null +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(math_function, notrans_mul_trans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 5); + EXPECT_EQ(out_ptr[1], 14); + EXPECT_EQ(out_ptr[2], 14); + EXPECT_EQ(out_ptr[3], 50); + delete gpu_place; +} + +TEST(math_function, trans_mul_notrans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, *gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); + + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); + + float* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(out_ptr[0], 9); + EXPECT_EQ(out_ptr[1], 12); + EXPECT_EQ(out_ptr[2], 15); + EXPECT_EQ(out_ptr[3], 12); + EXPECT_EQ(out_ptr[4], 17); + EXPECT_EQ(out_ptr[5], 22); + EXPECT_EQ(out_ptr[6], 15); + EXPECT_EQ(out_ptr[7], 22); + EXPECT_EQ(out_ptr[8], 29); + delete gpu_place; +} + +TEST(math_function, gemm_notrans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, 
context, &input3_gpu); + float* a = input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input2; + paddle::framework::Tensor input3; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor input3_gpu; + + int m = 2; + int n = 3; + int k = 3; + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr1[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr1, 6 * sizeof(float)); + float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + memcpy(input2_ptr, arr2, 12 * sizeof(float)); + float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + memcpy(input3_ptr, arr3, 8 * sizeof(float)); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(*gpu_place); + + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); + float* a = input1_gpu.data(); + float* b = input2_gpu.data(); + float* c = input3_gpu.mutable_data(*gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); + + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); + context.Wait(); + + EXPECT_EQ(input3_ptr[0], 0); + EXPECT_EQ(input3_ptr[1], 24); + EXPECT_EQ(input3_ptr[2], 28); + EXPECT_EQ(input3_ptr[3], 32); + EXPECT_EQ(input3_ptr[4], 4); + EXPECT_EQ(input3_ptr[5], 73); + EXPECT_EQ(input3_ptr[6], 86); + EXPECT_EQ(input3_ptr[7], 99); + delete gpu_place; +} + +template +void GemvTest(int m, int n, bool trans) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor vec_b; + paddle::framework::Tensor vec_c; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* data_a = mat_a.mutable_data({m, n}, *cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); + T* data_c = vec_c.mutable_data({trans ? 
n : m}, *cpu_place); + + auto* gpu_place = new paddle::platform::CUDAPlace(0); + paddle::framework::Tensor g_mat_a; + paddle::framework::Tensor g_vec_b; + paddle::framework::Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + + for (int i = 0; i < mat_a.numel(); ++i) { + data_a[i] = static_cast(i); + } + for (int i = 0; i < vec_b.numel(); ++i) { + data_b[i] = static_cast(i); + } + + paddle::platform::CUDADeviceContext context(*gpu_place); + paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b); + + paddle::operators::math::gemv( + context, trans, static_cast(m), static_cast(n), 1., g_data_a, + g_data_b, 0., g_data_c); + + paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); + + if (!trans) { + for (int i = 0; i < m; ++i) { + T sum = 0.0; + for (int j = 0; j < n; ++j) { + sum += data_a[i * n + j] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } else { + for (int i = 0; i < n; ++i) { + T sum = 0.0; + for (int j = 0; j < m; ++j) { + sum += data_a[j * n + i] * data_b[j]; + } + ASSERT_FLOAT_EQ(data_c[i], sum); + } + } +} + +TEST(math_function, gemv) { + GemvTest(3, 13, false); + GemvTest(3, 13, false); + GemvTest(3, 13, true); + GemvTest(3, 13, true); +} diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..50f79979d99141c8074c55d2b767285dced49f60 --- /dev/null +++ b/paddle/fluid/operators/math/matmul.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +// Implements the logic of numpy matmul: +// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html +// +// but allowing also for a, b to be transposed +// +// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported +// yet. 
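// Illustrative sketch with hypothetical names: the shape bookkeeping performed
// by MatMulFunctor below, following numpy.matmul conventions. It derives M, K,
// N and the batch count from the operand ranks and transpose flags; the sketch
// covers ranks 1-3 only (the functor additionally folds higher-rank leading
// dimensions into the batch).
#include <cassert>
#include <vector>

struct MatMulShape {
  int M, K, N;      // product is (batch x) M x N with contraction length K
  int batch_count;  // 0 means a plain, non-batched matmul
};

inline MatMulShape InferMatMulShape(const std::vector<int>& dim_a, bool trans_a,
                                    const std::vector<int>& dim_b, bool trans_b) {
  MatMulShape s{0, 0, 0, 0};
  int batch_a = 0, batch_b = 0, kb = 0;
  switch (dim_a.size()) {
    case 1:  // vector: prepend 1 (no transpose) or append 1 (transpose)
      s.M = trans_a ? dim_a[0] : 1;
      s.K = trans_a ? 1 : dim_a[0];
      break;
    case 2:
      s.M = trans_a ? dim_a[1] : dim_a[0];
      s.K = trans_a ? dim_a[0] : dim_a[1];
      break;
    default:  // 3-D: the leading dimension is the batch
      batch_a = dim_a[0];
      s.M = trans_a ? dim_a[2] : dim_a[1];
      s.K = trans_a ? dim_a[1] : dim_a[2];
  }
  switch (dim_b.size()) {
    case 1:  // vector: append 1 (no transpose) or prepend 1 (transpose)
      kb = trans_b ? 1 : dim_b[0];
      s.N = trans_b ? dim_b[0] : 1;
      break;
    case 2:
      kb = trans_b ? dim_b[1] : dim_b[0];
      s.N = trans_b ? dim_b[0] : dim_b[1];
      break;
    default:
      batch_b = dim_b[0];
      kb = trans_b ? dim_b[2] : dim_b[1];
      s.N = trans_b ? dim_b[1] : dim_b[2];
  }
  assert(s.K == kb);  // contraction lengths must agree
  assert(batch_a == 0 || batch_b == 0 || batch_a == batch_b);
  s.batch_count = batch_a > batch_b ? batch_a : batch_b;
  return s;
}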
+template +class MatMulFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { + auto dim_a = a.dims(); + auto dim_b = b.dims(); + + PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(), + "Tensors must all be in the same place."); + PADDLE_ENFORCE_GE(dim_a.size(), 1, + "Input tensor a must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_b.size(), 1, + "Input tensor b must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_a.size() > 3) { + PADDLE_ENFORCE(dim_b.size() == dim_a.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_b.size()); + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_a.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_a[j]); + batch_count *= dim_a[j]; + } + } + + int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0, + strideA = 0, strideB = 0; + + switch (dim_a.size()) { + case 1: + // similar to np.matmul: + // prepend dimension 1 (no transpose) or append dimension 1 (transpose) + M = trans_a ? dim_a[0] : 1; + kA = trans_a ? 1 : dim_a[0]; + break; + case 2: + M = trans_a ? dim_a[1] : dim_a[0]; + kA = trans_a ? dim_a[0] : dim_a[1]; + break; + case 3: + batchCountA = dim_a[0]; + M = trans_a ? dim_a[2] : dim_a[1]; + kA = trans_a ? dim_a[1] : dim_a[2]; + strideA = M * kA; + break; + default: + batchCountA = batch_count; + size_t mat_s = dim_a.size() - 2; + M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s]; + kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1]; + strideA = M * kA; + } + + switch (dim_b.size()) { + case 1: + // similar to np.matmul: + // append dimension 1 (no transpose) or prepend dimension 1 (transpose) + kB = trans_b ? 1 : dim_b[0]; + N = trans_b ? dim_b[0] : 1; + break; + case 2: + kB = trans_b ? dim_b[1] : dim_b[0]; + N = trans_b ? dim_b[0] : dim_b[1]; + break; + case 3: + batchCountB = dim_b[0]; + kB = trans_b ? dim_b[2] : dim_b[1]; + N = trans_b ? dim_b[1] : dim_b[2]; + strideB = kB * N; + break; + default: + batchCountB = batch_count; + size_t mat_s = dim_b.size() - 2; + kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s]; + N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1]; + strideB = kB * N; + } + + PADDLE_ENFORCE_EQ( + kA, kB, + "First matrix's width must be equal with second matrix's height."); + if (batchCountA && batchCountB) { + PADDLE_ENFORCE_EQ( + batchCountA, batchCountB, + "When input tensors a and b are both batched, they must have the " + "same batch dimension."); + } + int batchCount = std::max(batchCountA, batchCountB); + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; + + if (!batchCount) { + // regular matrix multiplication + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); + } else { + // batched matrix multiplication + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc new file mode 100644 index 0000000000000000000000000000000000000000..746328cd45ada637132ac661c87f4ac4710aeaa4 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { +namespace math { + +// All tensors are in NCHW format, and the groups must be greater than 1 +template +class MaxOutFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + for (int ph = 0; ph < groups; ++ph) { + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; + ele = ele > x ? 
ele : x; + } + output_data[(new_bindex + new_cindex + f)] = ele; + } + } + } + } +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0 = (blen + clen) * groups + f; + bool continue_match = true; + int output_idx = blen + clen + f; + for (int g = 0; g < groups && continue_match; ++g) { + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } + } + } + } + } + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu new file mode 100644 index 0000000000000000000000000000000000000000..68e5dfc3c551958c1f201341b8a704d9306ef150 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelMaxOut(const int nthreads, const T* input_data, + const int channels, const int input_height, + const int input_width, int groups, + T* output_data) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + T ele = static_cast(-FLT_MAX); + for (int g = 0; g < groups; ++g) { + T x = input_data[data_idx + g * feat_len]; + ele = ele > x ? 
ele : x; + } + output_data[i] = ele; + } +} +template +__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, + const T* output_data, const T* output_grad, + T* input_grad, const int channels, + const int input_height, const int input_width, + int groups) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + int max_index = -1; + bool continue_match = true; + for (int g = 0; g < groups && continue_match; ++g) { + if (input_data[data_idx + g * feat_len] == output_data[i]) { + max_index = data_idx + g * feat_len; + continue_match = false; + break; + } + } + if (max_index != -1) { + input_grad[max_index] += output_grad[index]; + } + } +} +/* + * All tensors are in NCHW format. + */ +template +class MaxOutFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); + } +}; +/* + * All tensors are in NCHW format. 
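// Illustrative sketch, a hypothetical CPU-only helper: the maxout rule that the
// functor and kernel above implement for a single NCHW sample. Output channel c
// at pixel f is the maximum over `groups` consecutive input channels.
#include <algorithm>
#include <vector>

template <typename T>
std::vector<T> MaxOutReference(const std::vector<T>& input,  // one sample, C*H*W
                               int channels, int height, int width, int groups) {
  const int feat_len = height * width;
  const int out_channels = channels / groups;
  std::vector<T> output(out_channels * feat_len);
  for (int c = 0; c < out_channels; ++c) {
    for (int f = 0; f < feat_len; ++f) {
      T best = input[c * groups * feat_len + f];
      for (int g = 1; g < groups; ++g) {
        best = std::max(best, input[(c * groups + g) * feat_len + f]);
      }
      output[c * feat_len + f] = best;
    }
  }
  return output;
}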
+ */ +template +class MaxOutGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); + } +}; + +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h new file mode 100644 index 0000000000000000000000000000000000000000..0e81790f0aba422f6676cd329def95a642a12239 --- /dev/null +++ b/paddle/fluid/operators/math/maxouting.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; + +template +class MaxOutGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, int groups); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..9adb142f14ea984c598855cf30838f0a1f2e5015 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cc @@ -0,0 +1,760 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class Pool2dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(ele, input_data[h * input_width + w]); + } + } + int pool_size = (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, (static_cast(pool_size))); + output_data[ph * output_width + pw] = ele; + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* +* All tensors are in NCHW format. +* Ksize, strides, paddings are two elements. These two elements represent height +* and width, respectively. 
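// Illustrative sketch, names local to this sketch: the window arithmetic used
// by Pool2dFunctor above -- for an output position p, the clamped half-open
// input range covered by its pooling window.
#include <algorithm>

struct PoolWindow {
  int start, end;  // half-open [start, end) range in the input
};

inline PoolWindow PoolWindow1D(int p, int ksize, int stride, int padding,
                               int input_size) {
  int start = p * stride - padding;
  int end = std::min(start + ksize, input_size);
  start = std::max(start, 0);
  return {start, end};
}
// The divisor used when averaging is the clamped window area,
//   pool_size = (hend - hstart) * (wend - wstart),
// which is exactly what pool_process.finalize receives above.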
+*/ +template +class Pool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_grad_process.compute( + input_data[h * input_width + w], + output_data[ph * output_width + pw], + output_grad_data[ph * output_width + pw], + input_grad_data[h * input_width + w], + static_cast(scale)); + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
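// Illustrative sketch, hypothetical helper: what the 1/pool_size scale factor
// in Pool2dGradFunctor above amounts to for the average-pooling case -- every
// input cell covered by an output window receives an equal share of that
// window's output gradient. The actual per-element rule is supplied by
// pool_grad_process.compute.
#include <vector>

template <typename T>
void AvgPoolBackwardWindow(const std::vector<int>& window_cells,  // flat input indices
                           T window_output_grad, std::vector<T>* input_grad) {
  const T scale = static_cast<T>(1) / static_cast<T>(window_cells.size());
  for (int idx : window_cells) {
    (*input_grad)[idx] += window_output_grad * scale;
  }
}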
+ */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = h * input_width + w; + int output_idx = ph * output_width + pw; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
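// Illustrative sketch, hypothetical helper: the gradient routing in
// MaxPool2dGradFunctor above. The output gradient goes to the first input cell
// in the window whose value equals the pooled maximum (the `stop` flag); all
// other cells receive nothing.
#include <vector>

template <typename T>
void MaxPoolBackwardWindow(const std::vector<T>& input,
                           const std::vector<int>& window_cells,  // flat indices
                           T pooled_value, T window_output_grad,
                           std::vector<T>* input_grad) {
  for (int idx : window_cells) {
    if (input[idx] == pooled_value) {
      (*input_grad)[idx] += window_output_grad;
      break;  // only the first matching element gets the gradient
    }
  }
}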
+ */ +template +class Pool3dFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + int output_idx = (pd * output_height + ph) * output_width + pw; + T ele = pool_process.initial(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute( + ele, + input_data[(d * input_height + h) * input_width + w]); + } + } + } + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, static_cast(pool_size)); + output_data[output_idx] = ele; + } + } + } + input_data += input_stride; + output_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class Pool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_grad_process, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + float scale = 1.0 / pool_size; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + pool_grad_process.compute( + input_data[input_idx], output_data[output_idx], + output_grad_data[output_idx], + input_grad_data[input_idx], static_cast(scale)); + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + bool stop = false; + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + int output_idx = + (pd * output_height + ph) * output_width + pw; + + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += + output_grad_data[output_idx]; + stop = true; + } + } + } + } + } + } + } + input_data += input_stride; + output_data += output_stride; + input_grad_data += input_stride; + output_grad_data += output_stride; + } + } + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
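// Illustrative sketch: the flat NCDHW indexing used by the 3-D pooling
// functors above. Within one (sample, channel) block an element at (d, h, w)
// lives at (d * height + h) * width + w, and consecutive channels are
// depth * height * width elements apart (the input_stride above).
inline int Flatten3D(int d, int h, int w, int height, int width) {
  return (d * height + h) * width + w;
}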
+ */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[h * input_width + w]) { + ele = input_data[h * input_width + w]; + index = h * input_width + w; + } + } + } + output_data[ph * output_width + pw] = ele; + mask_data[ph * output_width + pw] = index; + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
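// Illustrative sketch, hypothetical helper: because the forward pass above
// records, for every output element, the flat input index of its maximum, the
// with-index backward pass below reduces to a scatter-add through the saved
// mask, one (sample, channel) plane at a time.
#include <vector>

template <typename T1, typename T2>
void MaxPoolWithIndexBackwardPlane(const std::vector<T1>& output_grad,
                                   const std::vector<T2>& mask,  // same length
                                   std::vector<T1>* input_grad) {
  for (size_t i = 0; i < output_grad.size(); ++i) {
    (*input_grad)[static_cast<int>(mask[i])] += output_grad[i];
  }
}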
+ */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_channels = output_grad.dims()[1]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int input_stride = input_height * input_width; + const int output_stride = output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = ph * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - 
padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + int output_idx = (pd * output_height + ph) * output_width + pw; + T1 ele = static_cast(-FLT_MAX); + int index = -1; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_idx = (d * input_height + h) * input_width + w; + if (ele < input_data[input_idx]) { + index = input_idx; + ele = input_data[input_idx]; + } + } + } + } + output_data[output_idx] = ele; + mask_data[output_idx] = index; + } + } + } + // offset + input_data += input_stride; + output_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_channels = output_grad.dims()[1]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int input_stride = input_depth * input_height * input_width; + const int output_stride = output_depth * output_height * output_width; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < output_channels; ++c) { + for (int pd = 0; pd < output_depth; ++pd) { + for (int ph = 0; ph < output_height; ++ph) { + for (int pw = 0; pw < output_width; ++pw) { + const int output_idx = + (pd * output_height + ph) * output_width + pw; + const int input_idx = static_cast(mask_data[output_idx]); + input_grad_data[input_idx] += output_grad_data[output_idx]; + } + } + } + // offset + input_grad_data += input_stride; + output_grad_data += output_stride; + mask_data += output_stride; + } + } + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c65632de9066251a94ab3c32d4382d44f4120c2a --- /dev/null +++ b/paddle/fluid/operators/math/pooling.cu @@ -0,0 +1,1041 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelPool2D(const int nthreads, const T* input_data, + const int channels, const int input_height, + const int input_width, const int output_height, + const int output_width, const int ksize_height, + const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, + const int padding_width, PoolProcess pool_process, + T* output_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + T ele = pool_process.initial(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + pool_process.compute(ele, input_data[h * input_width + w]); + } + } + int pool_size = (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, (static_cast(pool_size))); + output_data[index] = ele; + } +} + +template +__global__ void KernelPool2DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, + const int input_width, const int output_height, const int output_width, + const int ksize_height, const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + PoolProcess pool_process, T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int offsetW = index % input_width + padding_width; + int offsetH = (index / input_width) % input_height + padding_height; + int offsetC = (index / input_width / input_height) % channels; + int batch_idx = index / input_width / input_height / channels; + + int phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + int pwstart = (offsetW < ksize_width) + ? 
0 + : (offsetW - ksize_width) / stride_width + 1; + int phend = min(offsetH / stride_height + 1, output_height); + int pwend = min(offsetW / stride_width + 1, output_width); + T gradient = 0; + T input = input_data[index]; + int output_idx = + (batch_idx * channels + offsetC) * output_height * output_width; + output_data += output_idx; + output_grad += output_idx; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int pool_size = (hend - hstart) * (wend - wstart); + int output_sub_idx = ph * output_width + pw; + pool_process.compute(input, output_data[output_sub_idx], + output_grad[output_sub_idx], gradient, + static_cast(1.0 / pool_size)); + } + } + input_grad[index] = gradient; + } +} + +template +__global__ void KernelMaxPool2DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, + const int input_width, const int output_height, const int output_width, + const int ksize_height, const int ksize_width, const int stride_height, + const int stride_width, const int padding_height, const int padding_width, + T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + input_grad += (batch_idx * channels + c) * input_height * input_width; + + T ele = output_data[index]; + int maxIndex = -1; + bool stop = false; + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + if (ele == input_data[h * input_width + w]) { + maxIndex = h * input_width + w; + stop = true; + } + } + } + + if (maxIndex != -1) { + // atomic add + platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + } + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
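// Illustrative sketch, names local to this sketch: the inverse mapping used by
// KernelPool2DGrad above -- for a padded input offset, the half-open range
// [pstart, pend) of output positions whose pooling windows cover it.
#include <algorithm>

inline void CoveringOutputRange(int offset,  // input index plus padding
                                int ksize, int stride, int output_size,
                                int* pstart, int* pend) {
  *pstart = (offset < ksize) ? 0 : (offset - ksize) / stride + 1;
  *pend = std::min(offset / stride + 1, output_size);
}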
+ */ +template +class Pool2dFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2D<<>>( + nthreads, input_data, input_channels, input_height, input_width, + output_height, output_width, ksize_height, ksize_width, stride_height, + stride_width, padding_height, padding_width, pool_process, output_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class Pool2dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + pool_process, input_grad_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
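// Illustrative sketch, hypothetical helper: the launch configuration used by
// the CUDA pooling functors above -- one thread per element, 1024-thread
// blocks, and enough blocks to cover nthreads. The kernels additionally use a
// grid-stride loop, so any remaining elements are still processed.
#include <utility>

inline std::pair<int, int> PoolLaunchConfig(int nthreads) {
  const int threads_per_block = 1024;
  const int blocks = (nthreads + threads_per_block - 1) / threads_per_block;
  return {blocks, threads_per_block};  // grid size, block size
}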
+ */ +template +class MaxPool2dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_height, input_width, output_height, output_width, ksize_height, + ksize_width, stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; + +template +__global__ void KernelPool3D(const int nthreads, const T* input_data, + const int channels, const int input_depth, + const int input_height, const int input_width, + const int output_depth, const int output_height, + const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, + const int stride_depth, const int stride_height, + const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, + PoolProcess pool_process, T* output_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T ele = pool_process.initial(); + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + 
pool_process.compute( + ele, input_data[(d * input_height + h) * input_width + w]); + } + } + } + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + pool_process.finalize(ele, static_cast(pool_size)); + output_data[index] = ele; + } +} + +template +__global__ void KernelPool3DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, + const int input_height, const int input_width, const int output_depth, + const int output_height, const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, const int stride_depth, + const int stride_height, const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, PoolProcess pool_process, + T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int offsetW = index % input_width + padding_width; + int offsetH = (index / input_width) % input_height + padding_height; + int offsetD = + (index / input_width / input_height) % input_depth + padding_depth; + int offsetC = (index / input_width / input_height / input_depth) % channels; + int batch_idx = index / input_width / input_height / input_depth / channels; + + int pdstart = (offsetD < ksize_depth) + ? 0 + : (offsetD - ksize_depth) / stride_depth + 1; + int phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + int pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + int pdend = min((offsetD) / stride_depth + 1, output_depth); + int phend = min((offsetH) / stride_height + 1, output_height); + int pwend = min((offsetW) / stride_width + 1, output_width); + + T gradient = 0; + T input = input_data[index]; + int output_idx = (batch_idx * channels + offsetC) * output_depth * + output_height * output_width; + output_data += output_idx; + output_grad += output_idx; + + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int output_sub_idx = (pd * output_height + ph) * output_width + pw; + pool_process.compute(input, output_data[output_sub_idx], + output_grad[output_sub_idx], gradient, + static_cast(1.0 / pool_size)); + } + } + } + input_grad[index] = gradient; + } +} + +template +__global__ void KernelMaxPool3DGrad( + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, + const int input_height, const int input_width, const int output_depth, + const int output_height, const int output_width, const int ksize_depth, + const int ksize_height, const int ksize_width, const int stride_depth, + const int stride_height, const int stride_width, const int padding_depth, + const int padding_height, const int padding_width, T* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int 
pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T ele = output_data[index]; + bool stop = false; + int maxIdx = -1; + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + input_grad += + (batch_idx * channels + c) * input_depth * input_height * input_width; + + for (int d = dstart; d < dend && !stop; ++d) { + for (int h = hstart; h < hend && !stop; ++h) { + for (int w = wstart; w < wend && !stop; ++w) { + if (ele == input_data[(d * input_height + h) * input_width + w]) { + stop = true; + maxIdx = (d * input_height + h) * input_width + w; + } + } + } + } + if (maxIdx != -1) { + // atomic add + platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + } + } +} + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + */ +template +class Pool3dFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input_data, input_channels, input_depth, input_height, + input_width, output_depth, output_height, output_width, ksize_depth, + ksize_height, ksize_width, stride_depth, stride_height, stride_width, + padding_depth, padding_height, padding_width, pool_process, + output_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
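 + * As a worked example of the launch configuration used below (numbers are
 + * illustrative only): a gradient pass over a batch of 2 with 3 channels and
 + * an 8x8x8 input gives nthreads = 2 * 3 * 8 * 8 * 8 = 3072, so
 + * blocks = (3072 + 1024 - 1) / 1024 = 3 and the kernel is launched with
 + * grid(3, 1) and threads(1024, 1).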
+ */ +template +class Pool3dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + PoolProcess pool_process, framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, pool_process, input_grad_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_channels, + input_depth, input_height, input_width, output_depth, output_height, + output_width, ksize_depth, ksize_height, ksize_width, stride_depth, + stride_height, stride_width, padding_depth, padding_height, + padding_width, input_grad_data); + } +}; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; + +template +__global__ void KernelMaxPool2dWithIdx( + const int nthreads, const T1* input_data, const int channels, + const int input_height, const int input_width, const int output_height, + const int output_width, const int ksize_height, const int ksize_width, + const int stride_height, const int stride_width, const int padding_height, + const int padding_width, T1* output_data, T2* mask_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int c = (index / output_width / output_height) % channels; + int batch_idx = index / output_width / output_height / channels; + + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + + input_data += (batch_idx * channels + c) * input_height * input_width; + T1 ele = -FLT_MAX; + int max_index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_index = h * input_width + w; + if (ele < input_data[input_index]) { + max_index = input_index; + ele = 
input_data[input_index]; + } + } + } + output_data[index] = ele; + mask_data[index] = max_index; + } +} + +template +__global__ void KernelMaxPool2DWithIdxGrad( + const int nthreads, const T1* output_grad, const T2* mask_data, + const int channels, const int input_height, const int input_width, + const int output_height, const int output_width, const int ksize_height, + const int ksize_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, T1* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int c_offset = (index / input_width / input_height) % channels; + int batch_idx = index / input_width / input_height / channels; + + int ph_start = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + int pw_start = + (w_offset + padding_width < ksize_width) + ? 0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + int ph_end = + min((h_offset + padding_height) / stride_height + 1, output_height); + int pw_end = + min((w_offset + padding_width) / stride_width + 1, output_width); + + T1 gradient = 0; + int input_current_featuremap_idx = h_offset * input_width + w_offset; + int output_idx = + (batch_idx * channels + c_offset) * output_height * output_width; + + mask_data += output_idx; + output_grad += output_idx; + for (int ph = ph_start; ph < ph_end; ++ph) { + for (int pw = pw_start; pw < pw_end; ++pw) { + if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) + gradient += output_grad[ph * output_width + pw]; + } + } + input_grad[index] = gradient; + } +} + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_height * output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2dWithIdx<<>>( + nthreads, input_data, input_channels, input_height, input_width, + output_height, output_width, ksize_height, ksize_width, stride_height, + stride_width, padding_height, padding_width, output_data, mask_data); + } +}; + +/* + * All tensors are in NCHW format. + * Ksize, strides, paddings are two elements. These two elements represent + * height and width, respectively. 
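 + * The mask written by the forward functor stores, for each output element,
 + * the flattened offset h * input_width + w of the chosen maximum inside its
 + * input feature map; KernelMaxPool2DWithIdxGrad above compares that offset
 + * with the current input position to decide where output_grad is routed.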
+ */ +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_channels = input_grad->dims()[1]; + const int input_height = input_grad->dims()[2]; + const int input_width = input_grad->dims()[3]; + const int output_height = output_grad.dims()[2]; + const int output_width = output_grad.dims()[3]; + const int ksize_height = ksize[0]; + const int ksize_width = ksize[1]; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const T2* mask_data = mask.data(); + const T1* output_grad_data = output_grad.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = batch_size * input_channels * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool2DWithIdxGrad<<>>( + nthreads, output_grad_data, mask_data, input_channels, input_height, + input_width, output_height, output_width, ksize_height, ksize_width, + stride_height, stride_width, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; + +template +__global__ void KernelMaxPool3DWithIdx( + const int nthreads, const T1* input_data, const int channels, + const int input_depth, const int input_height, const int input_width, + const int output_depth, const int output_height, const int output_width, + const int ksize_depth, const int ksize_height, const int ksize_width, + const int stride_depth, const int stride_height, const int stride_width, + const int padding_depth, const int padding_height, const int padding_width, + T1* output_data, T2* mask_data) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int pw = index % output_width; + int ph = (index / output_width) % output_height; + int pd = (index / output_width / output_height) % output_depth; + int c = (index / output_width / output_height / output_depth) % channels; + int batch_idx = + index / output_width / output_height / output_depth / channels; + + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + T1 ele = -FLT_MAX; + int max_index = -1; + input_data += + (batch_idx * channels + c) * input_depth * input_height * input_width; + + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (ele < input_data[(d * input_height + h) * input_width + w]) { + max_index = (d * input_height + h) * input_width + w; + ele = input_data[max_index]; + } + } + } + } + output_data[index] = ele; + mask_data[index] = max_index; + } +} + +template +__global__ void KernelMaxPool3DWithIdxGrad( + const int nthreads, const T1* 
output_grad, const T2* mask, + const int channels, const int input_depth, const int input_height, + const int input_width, const int output_depth, const int output_height, + const int output_width, const int ksize_depth, const int ksize_height, + const int ksize_width, const int stride_depth, const int stride_height, + const int stride_width, const int padding_depth, const int padding_height, + const int padding_width, T1* input_grad) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; + index += blockDim.x * gridDim.x) { + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int d_offset = (index / input_width / input_height) % input_depth; + int c_offset = + (index / input_width / input_height / input_depth) % channels; + int batch_idx = index / input_width / input_height / input_depth / channels; + + int pd_start = + (d_offset + padding_depth < ksize_depth) + ? 0 + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; + int ph_start = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + int pw_start = + (w_offset + padding_width < ksize_width) + ? 0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + int pd_end = + min((d_offset + padding_depth) / stride_depth + 1, output_depth); + int ph_end = + min((h_offset + padding_height) / stride_height + 1, output_height); + int pw_end = + min((w_offset + padding_width) / stride_width + 1, output_width); + + T1 gradient = 0; + int input_current_feature_map_idx = + (d_offset * input_height + h_offset) * input_width + w_offset; + int output_idx = (batch_idx * channels + c_offset) * output_depth * + output_height * output_width; + mask += output_idx; + output_grad += output_idx; + + for (int pd = pd_start; pd < pd_end; ++pd) { + for (int ph = ph_start; ph < ph_end; ++ph) { + for (int pw = pw_start; pw < pw_end; ++pw) { + if (mask[(pd * output_height + ph) * output_width + pw] == + input_current_feature_map_idx) + gradient += + output_grad[(pd * output_height + ph) * output_width + pw]; + } + } + } + input_grad[index] = gradient; + } +} + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* output, framework::Tensor* mask) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[1]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T1* input_data = input.data(); + T1* output_data = output->mutable_data(context.GetPlace()); + T2* mask_data = mask->mutable_data(context.GetPlace()); + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DWithIdx<<>>( + nthreads, input_data, input_channels, input_depth, input_height, + input_width, output_depth, output_height, output_width, ksize_depth, + ksize_height, ksize_width, stride_depth, stride_height, stride_width, + padding_depth, padding_height, padding_width, output_data, mask_data); + } +}; + +/* + * All tensors are in NCDHW format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. 
+ */ +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad) { + const int batch_size = input_grad->dims()[0]; + const int input_channels = input_grad->dims()[1]; + const int input_depth = input_grad->dims()[2]; + const int input_height = input_grad->dims()[3]; + const int input_width = input_grad->dims()[4]; + const int output_depth = output_grad.dims()[2]; + const int output_height = output_grad.dims()[3]; + const int output_width = output_grad.dims()[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + const T1* output_grad_data = output_grad.data(); + const T2* mask_data = mask.data(); + T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + int nthreads = + batch_size * input_channels * input_depth * input_height * input_width; + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxPool3DWithIdxGrad<<>>( + nthreads, output_grad_data, mask_data, input_channels, input_depth, + input_height, input_width, output_depth, output_height, output_width, + ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, + stride_width, padding_depth, padding_height, padding_width, + input_grad_data); + } +}; + +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..1195038f6a067fde776df8013a2a81e7003489a1 --- /dev/null +++ b/paddle/fluid/operators/math/pooling.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX \ + __FLT_MAX__ // It might need to be placed in another file, but I'm still + // wondering where to put it. + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. 
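+ *   (Illustrative trace, not part of the patch: max pooling over the values
+ *   {1, 5, 3} starts from ele = initial() = -FLT_MAX, then applies
+ *   compute(ele, 1), compute(ele, 5), compute(ele, 3), leaving ele = 5;
+ *   finalize() is a no-op for max pooling.)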
+ * AvgPool initializes temp variable to zero to accumulate all values
+ * in the pooling field, and finally takes the average.
+ * MaxPoolGrad and AvgPoolGrad are the corresponding gradient operations.
+ */
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
+  DEVICE inline void finalize(T& y, const T& pool_field) {}
+};
+
+template <class T>
+class AvgPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(T& y, const T& x) { y += x; }
+  DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; }
+};
+
+template <class T>
+class MaxPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += dy * (x == y);
+  }
+};
+
+template <class T>
+class AvgPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += (scale * dy);
+  }
+};
+
+/*
+ * \brief Getting pooling results, and calculating gradient.
+ *
+ * In pool2d, all tensors are in NCHW format, where N is batch size, C is the
+ * number of channels, and H and W are the height and width of the feature map.
+ * In pool3d, all tensors are in NCDHW format, where N is batch size, C is the
+ * number of channels, and D, H and W are the depth, height and width of the
+ * feature map.
+ *
+ * In max pooling, the pooling region may contain multiple maximum elements.
+ * In that case we compute the gradient of the first maximum element only.
+ * This is different from average pooling, so the max pooling gradient is
+ * implemented separately: MaxPool2dGradFunctor, MaxPool3dGradFunctor.
+ */
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+/*
+ * \brief Getting max pooling results and corresponding max index, and
+ *
calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& output_grad, + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, + framework::Tensor* input_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc similarity index 100% rename from paddle/operators/math/sampler.cc rename to paddle/fluid/operators/math/sampler.cc diff --git a/paddle/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h similarity index 100% rename from paddle/operators/math/sampler.h rename to paddle/fluid/operators/math/sampler.h diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..01aa37ab35ce906133f6195df5d7014b4fb23d16 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -0,0 +1,298 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + auto& in1_rows = input1.rows(); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + auto out_place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + SetConstant functor; + functor(context, output, 0.0); + + auto* in1_data = in1_value.data(); + auto* out_data = output->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + out_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + auto& in1_rows = input1.rows(); + auto& in2_rows = 
*(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T)); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +// This is a separated namespace for manipulate SelectedRows typed +// data. Like merge duplicated rows, adding two SelectedRows etc. +// +// Another group of functors is called "scatter updates", which means +// use SelectedRows to update a dense tensor with different Ops, like +// add or mul. 
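+// A minimal usage sketch of the scatter helpers below (illustrative only;
+// `cpu_ctx` and `grad` are hypothetical names, not part of this patch):
+//
+//   scatter::MergeAdd<platform::CPUDeviceContext, float> merge_func;
+//   framework::SelectedRows merged = merge_func(cpu_ctx, grad);
+//
+// `merged` then contains one row per distinct row index of `grad`, with
+// duplicated rows summed element-wise.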
+namespace scatter { + +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use macro fix the below messy code. 
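+    // INLINE_FOR2(sizei, sizej), defined in selected_rows_functor.h, expands
+    // to a nested loop over i in [0, sizei) and j in [0, sizej). The ADD case
+    // below therefore performs, for every sparse row i and column j,
+    //   input2_data[in1_rows[i] * in1_row_numel + j] +=
+    //       in1_data[i * in1_row_numel + j];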
+ switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } + } +}; + +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee3b5d52058f7f04d5eeda1d033171f9eae0d772 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -0,0 +1,385 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* output) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2.height()); + output->set_height(in1_height); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = input2.rows(); + std::vector out_rows; + out_rows.reserve(in1_rows.size() + in2_rows.size()); + + // concat rows + out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end()); + out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end()); + output->set_rows(out_rows); + + auto* out_value = output->mutable_value(); + auto& in1_value = input1.value(); + auto& in2_value = input2.value(); + + auto in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + + auto* out_data = out_value->data(); + auto* in1_data = in1_value.data(); + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + auto out_place = context.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(out_place)); + + memory::Copy( + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), + reinterpret_cast(context).stream()); + + auto* in2_data = in2_value.data(); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; + +namespace { +template +__global__ void SelectedRowsAddTensorKernel(const T* selected_rows, + const int64_t* rows, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we can not use + // tensor_out[index] += selected_rows[index]; Instead, we have to use + // AtomicAdd to avoid concurrent write error. 
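+      // Each block along the grid's y-dimension handles one sparse row
+      // (selected_rows and tensor_out were advanced above using blockIdx.y
+      // and rows[ty]), and its threads stride over the row_numel elements of
+      // that row in steps of block_size.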
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const framework::Tensor& input2, framework::Tensor* output) { + auto in1_height = input1.height(); + auto in2_dims = input2.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); + PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2.data(); + auto* out_data = output->data(); + + SetConstant functor; + functor(context, output, 0.0); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); + + auto out_eigen = framework::EigenVector::Flatten(*output); + auto in2_eigen = framework::EigenVector::Flatten(input2); + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; + } +}; + +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; + +template +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_ENFORCE_EQ(in1_height, input2->height()); + + framework::Vector in1_rows(input1.rows()); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + if (in1_rows.size()) { + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + } + + auto in1_place = input1.place(); + PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + auto in2_place = input2->place(); + PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); + } +}; + +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; + +namespace { +template +__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, + const int64_t* rows, + T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + } +} +} // namespace + +template +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + framework::Vector in1_rows(input1.rows()); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* in2_data = input2->data(); + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in1_rows.size()); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); + } +}; + +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; + +namespace scatter { + +template +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + } +} + +template +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + framework::Vector input_rows(input.rows()); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + auto* input_data = input.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<(context) + .stream()>>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use macro fix the below messy code. 
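+  // The pointers were advanced above so that selected_rows points at sparse
+  // row blockIdx.y and tensor_out at the dense row it updates; each case
+  // below lets the block's threads stride over that row's row_numel elements
+  // in steps of block_size.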
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance + // no additional MergeAdd called. + MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); + + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); + } +}; +} // namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..510a9ed8be6336448971de305279f607282f7658 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + +namespace paddle { +namespace operators { +namespace math { + +// SelectedRows + SelectedRows will simplely concat value and rows. 
+// The real computation happens when dealing with LoDTensor.
+template <typename DeviceContext, typename T>
+struct SelectedRowsAdd {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output);
+};
+
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddTensor {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output);
+};
+
+// input2 = input1 + input2
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddTo {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset, framework::SelectedRows* input2);
+};
+
+// input2 = input1 + input2
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+namespace scatter {
+// functors for manipulating SelectedRows data
+template <typename DeviceContext, typename T>
+struct MergeAdd {
+  // unary functor, merge by adding duplicated rows in
+  // the input SelectedRows object.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input);
+};
+
+template <typename DeviceContext, typename T>
+struct Add {
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
+    return out;
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct Mul {
+  // multiply two SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
+    return out;
+  }
+  // multiply a scalar with a SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const T input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    e_out.device(*context.eigen_device()) = input2 * e_in1;
+    return out;
+  }
+};
+
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// apply the given ScatterOps op between selected_rows_in and the dense tensor
+template <typename DeviceContext, typename T>
+struct UpdateToTensor {
+  void operator()(const DeviceContext& context, const ScatterOps& op,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+}  // namespace scatter
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc
b/paddle/fluid/operators/math/selected_rows_functor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..db6b41cd52049a32239eca1a39cd730c11ddc2d8 --- /dev/null +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" + +TEST(selected_rows_functor, cpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + auto* tensor2_data = tensor2->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor2_data[1 * row_numel + 1], 
3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, cpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CPUPlace cpu_place; + CPUDeviceContext ctx(cpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), cpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + auto* out_data = output->value().data(); + // input1 value + EXPECT_EQ(out_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + auto* tensor1_data = tensor1->data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3c4bc9244f9ca1771c5f435788cf3789d7c4574 --- /dev/null +++ 
b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +TEST(selected_rows_functor, gpu_add) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAdd add_functor; + add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + std::unique_ptr tensor2{new Tensor()}; + tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); + + SelectedRowsAddTensor add_tensor_functor; + add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); + + Tensor tensor2_cpu; + Copy(*tensor2, cpu_place, ctx, &tensor2_cpu); + ctx.Wait(); + + auto* tensor2_cpu_data = tensor2_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + 
EXPECT_EQ(tensor2_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor2_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + EXPECT_EQ(tensor2_cpu_data[9 * row_numel + 6], 5.0); +} + +TEST(selected_rows_functor, gpu_add_to) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators::math; + + CUDAPlace gpu_place(0); + CPUPlace cpu_place; + CUDADeviceContext ctx(gpu_place); + SetConstant functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows1{0, 4, 7}; + std::unique_ptr selected_rows1{new SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); + functor(ctx, in1_value, 1.0); + + std::vector rows2{0, 5, 7, 9}; + std::unique_ptr selected_rows2{new SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); + functor(ctx, in2_value, 2.0); + + std::unique_ptr output{new SelectedRows()}; + output->set_height(height); + auto* out_value = output->mutable_value(); + + // simplely concat two SelectedRows + out_value->mutable_data(make_ddim({7, 10}), gpu_place); + + SelectedRowsAddTo add_to_functor; + add_to_functor(ctx, *selected_rows1, 0, output.get()); + add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); + + auto out_height = output->height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output->rows(); + + // input1 rows + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + // input2 rows + EXPECT_EQ(out_rows[3], 0); + EXPECT_EQ(out_rows[4], 5); + EXPECT_EQ(out_rows[5], 7); + EXPECT_EQ(out_rows[6], 9); + + Tensor out_cpu; + Copy(*out_value, cpu_place, ctx, &out_cpu); + ctx.Wait(); + + auto* out_cpu_data = out_cpu.data(); + // input1 value + EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0); + EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0); + EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0); + EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0); + // input2 value + EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0); + EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0); + EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0); + EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); + EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); + + std::unique_ptr tensor1{new Tensor()}; + tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); + functor(ctx, tensor1.get(), 3.0); + + SelectedRowsAddToTensor add_to_tensor_functor; + add_to_tensor_functor(ctx, *output, tensor1.get()); + + Tensor tensor1_cpu; + Copy(*tensor1, cpu_place, ctx, &tensor1_cpu); + ctx.Wait(); + + auto* tensor1_cpu_data = tensor1_cpu.data(); + // row0: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[0 * row_numel + 0], 6.0); + // row1: 3.0 + EXPECT_EQ(tensor1_cpu_data[1 * row_numel + 1], 3.0); + // row4 : 1.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[4 * row_numel + 6], 4.0); + // row5: 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[5 * row_numel + 7], 5.0); + // row6: 3.0 + EXPECT_EQ(tensor1_cpu_data[6 * row_numel + 1], 3.0); + // row7: 1.0 + 2.0 + 3.0 + EXPECT_EQ(tensor1_cpu_data[7 * row_numel + 3], 6.0); + // row9: 2.0 + 3.0 + 
EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0); +} diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..0485070fd9b722bdf9011452b2545e065f46d2ac --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + size_t* index = index_lod.data(); + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + for (int i = 0; i < height; ++i) { + if (is_src_index) { + memcpy(dst_data + i * width, src_data + index[i] * width, + width * sizeof(T)); + } else { + memcpy(dst_data + index[i] * width, src_data + i * width, + width * sizeof(T)); + } + } + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..450be80ea2fe67aa0e537f06e15f07b38c5751ea --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, + int64_t height, int64_t width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int id = blockIdx.x + idy * GridDimX; + while (id < height) { + int src_idx = is_src_index ? index[id] : id; + int dst_idx = is_src_index ? id : index[id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += BlockDimX) { + dst_data[i] = src_data[i]; + } + id += BlockDimY * GridDimX; + } +} + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index) { + auto src_dims = src.dims(); + auto dst_dims = dst.dims(); + PADDLE_ENFORCE_EQ(src_dims.size(), 2, + "The src must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(dst_dims.size(), 2, + "The dst must be matrix with rank 2."); + PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst.data(); + + dim3 threads(128, 8); + dim3 grid(8, 1); + auto stream = context.stream(); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + width, is_src_index); + } +}; + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..00bd25ab613b198e539368f3233d71618dfc758f --- /dev/null +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, + // copy the indexed rows of input src to the output dst. + // If is_src_index is false, + // copy the input src to the indexed rows of output dst. + // The indexed rows are based on the input index. 
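+  // Illustrative example: with index = {2, 0, 1},
+  //   is_src_index == true  copies dst[0] = src[2], dst[1] = src[0],
+  //                         dst[2] = src[1]  (gather rows from src),
+  //   is_src_index == false copies dst[2] = src[0], dst[0] = src[1],
+  //                         dst[1] = src[2]  (scatter rows of src into dst).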
+ void operator()(const DeviceContext& context, const framework::Tensor& src, + framework::Vector index_lod, framework::Tensor& dst, + bool is_src_index); +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + // + struct SeqInfo { + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& lod_tensor, + framework::LoDTensor& batch, bool is_cal_batch_lod, + bool is_reverse = false) const { + if (!is_cal_batch_lod) { + auto lods = batch.lod(); + PADDLE_ENFORCE_GT(lods.size(), 2UL); + PADDLE_ENFORCE_EQ(lods[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, lods[1], batch, true); + return; + } + + auto lods = lod_tensor.lod(); + auto lod = lods[0]; + PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + + std::vector seq_info; + for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { + int length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), + [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // num_batch = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = len(b0) + // batch_start_positions[1] = len(b0) + len(b1) + // batch_start_positions[2] = len(b0) + len(b1) + len(b2) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + // The num_batch represents batch size after rearranging the + // input LodTensor. It is also the maximum length of input sequence. + + paddle::framework::LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + int num_batch = seq_info[0].length; + batch_lods[0].resize(static_cast(num_batch + 1)); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. + batch_lods[2].resize(seq_info.size()); + + size_t* batch_starts = batch_lods[0].data(); + size_t* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (int n = 0; n < num_batch; n++) { + auto batch_id = static_cast(batch_starts[n]); + for (size_t i = 0; i < seq_info.size(); ++i) { + int seq_len = seq_info[i].length; + int start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? 
start + seq_len - 1 - n : start + n; + batch_id++; + } else { + break; + } + } + batch_starts[n + 1] = static_cast(batch_id); + } + size_t* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + batch.set_lod(batch_lods); + + CopyMatrixRowsFunctor to_batch; + to_batch(context, lod_tensor, batch_lods[1], batch, true); + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& batch, + framework::LoDTensor& lod_tensor) const { + auto in_lod = batch.lod(); + PADDLE_ENFORCE_GT(in_lod.size(), 2UL); + PADDLE_ENFORCE_EQ(in_lod[1].size(), + static_cast(lod_tensor.dims()[0])); + CopyMatrixRowsFunctor to_seq; + to_seq(context, batch, in_lod[1], lod_tensor, false); + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8cd825676c77cc204bbb02f88f422d945f1a2c --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (int64_t i = 0; i < max_sequence_length; ++i) { + for (int64_t j = 0; j < num_sequences; ++j) { + int64_t start_pos = abs_offset_lod[level][j]; + int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + const int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (int64_t i = 0; i < num_sequences; ++i) { + int64_t start_pos = abs_offset_lod[level][i]; + int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (int64_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (int64_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1a390577840db2424185da19f5a5d2b231f25b6 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
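+   * For example, if sequence_width = 100 the expression below computes
+   * ((100 + 7) >> 3) = 13 threads, rounds that up to a multiple of the
+   * warp size (giving 32) and caps it at kBlockSize, so block_dim_x = 32
+   * and block_dim_y = 512 / 32 = 16, i.e. each block covers 16 sequence
+   * positions along its y dimension.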
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], + static_cast(abs_offset_lod[level].back()), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + int64_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const int64_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const int64_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const int64_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, + max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..0d84f9dcb3802d82cd385957f66ffe28269b8dfc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. 
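+ * Concretely, element k of time step i of sequence j is written to
+ * padding[(i * num_sequences + j) * sequence_width + k]; steps beyond the
+ * length of sequence j are filled with zeros, and when norm_by_times is
+ * true each copied value is scaled by 1 / (length of sequence j).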
+ */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..147cb37da2bbb1bad6fc423b3936a1446e17de15 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (int64_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + +TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + 
lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3b87ec93e19c4dcf99fbacf8f882f9f065430de --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), 1); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t k = 0; k < dim; ++k) { + out_data[i * dim + k] = in_data[starts[i] * dim + k]; + max_index[i * dim + k] = starts[i]; + } + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + max_index[i * dim + k] = j; + } + } + } + } + } +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto ig_dims = in_grad->dims(); + auto idx_dims = index.dims(); + PADDLE_ENFORCE_GT(og_dims.size(), 1); + PADDLE_ENFORCE_GT(ig_dims.size(), 1); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + for (int64_t j = 0; j < dim; ++j) { + int step_id = max_index[i * dim + j]; + ig_data[step_id 
* dim + j] = og_data[i * dim + j]; + } + } + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..c4267e992a78fd7e80a45ca1dcc1d81ca0f6e8f5 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +__global__ void KeMaxSequencePool(const T* input, const size_t* starts, + T* output, int* index, int64_t num_seq, + int64_t dim) { + int dim_idx = threadIdx.x; + int seq_id = blockIdx.x; + if (seq_id >= num_seq) return; + size_t start = starts[seq_id]; + size_t end = starts[seq_id + 1]; + + for (int64_t i = dim_idx; i < dim; i += blockDim.x) { + T max_val = static_cast(-FLT_MAX); + int max_id = -1; + for (size_t step_id = start; step_id < end; step_id++) { + if (max_val < input[step_id * dim + i]) { + max_val = input[step_id * dim + i]; + max_id = step_id; + } + } + output[seq_id * dim + i] = max_val; + index[seq_id * dim + i] = max_id; + } +} + +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto idx_dims = index->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, out_dims); + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int* max_index = index->data(); + + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + + dim3 threads(256, 1); + dim3 grid(num_seq, 1); + auto stream = context.stream(); + KeMaxSequencePool<<>>( + in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, + num_seq, dim); + } +}; + +template +__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, + T* in_grad, int64_t num_seq, + int64_t dim) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int col_idx = idx % dim; + if (idx < num_seq * dim) { + int step_id = max_index[idx]; + in_grad[step_id * dim + col_idx] = out_grad[idx]; + } +} + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + 
framework::LoDTensor* in_grad) { + auto og_dims = out_grad.dims(); + auto idx_dims = index.dims(); + auto ig_dims = in_grad->dims(); + PADDLE_ENFORCE_GT(og_dims.size(), static_cast(1)); + PADDLE_ENFORCE_GT(ig_dims.size(), static_cast(1)); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + } + PADDLE_ENFORCE_EQ(idx_dims, og_dims); + + const T* og_data = out_grad.data(); + const int* max_index = index.data(); + T* ig_data = in_grad->data(); + + SetConstant set_zero; + set_zero(context, in_grad, static_cast(0.0)); + int64_t num_seq = og_dims[0]; + int64_t dim = out_grad.numel() / num_seq; + + unsigned int blocks = (num_seq * dim + 128 - 1) / 128; + dim3 threads(128, 1); + dim3 grid(blocks, 1); + auto stream = context.stream(); + KeMaxSequencePoolGrad<<>>( + og_data, max_index, ig_data, num_seq, dim); + } +}; + +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba9cad74b54b3d0835ba5f89f6498f9309875cc --- /dev/null +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX __FLT_MAX__ + +template +class MaxSeqPoolFunctor { + public: + void operator()(const DeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index); +}; + +template +class MaxSeqPoolGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& out_grad, + const framework::Tensor& index, + framework::LoDTensor* in_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..427689b9718db6ed8cbb8525712404c1498faf20 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_scale.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + size_t seq_width = seq.dims()[1]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + T* seq_data = seq.mutable_data(context.GetPlace()); + for (size_t i = 0; i < num_seq; ++i) { + for (size_t j = lod[level][i] * seq_width; + j < lod[level][i + 1] * seq_width; ++j) { + seq_data[j] *= scales[i]; + } + } + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..7c081ed7f4547c4e1200ea7554f83696e404d021 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales, + const size_t seq_width) { + for (int i = threadIdx.x; + i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width; + i += BlockSize) { + int idx = lod[blockIdx.x] * seq_width + i; + seq[idx] *= scales[blockIdx.x]; + } +} + +template +class ScaleLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const T* scales) { + const size_t level = 0; + auto lod = seq.lod(); + const size_t num_seq = lod[level].size() - 1; + const size_t seq_width = seq.numel() / seq.dims()[0]; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + T* seq_data = seq.mutable_data(context.GetPlace()); + + SequenceScaleKernel<<< + num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( + seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), + scales, seq_width); + } +}; + +template class ScaleLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..e8e07fd3156cc516c904a1d3d510a7c6eed5b8a0 --- /dev/null +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Scale a sequence. + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * Given: + * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * scales = (2, 3, 4, 5) + * then: + * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) + + * + * \param context Device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param scales Array. The i-th sequence will be scaled by scales[i]. + * \param num_seq Number of sequence + * + */ +template +class ScaleLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const T* scales); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..eab31ec567d15a52661bf5ab2373819d8e1e7ddf --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu new file mode 100644 index 0000000000000000000000000000000000000000..733d7eeee6d08241783b8f854b25863f9b756c80 --- /dev/null +++ b/paddle/fluid/operators/math/softmax.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" + +namespace paddle { +namespace operators { +namespace math { + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..b7d67d5f12d83f015297e75c730de27566e5489b --- /dev/null +++ b/paddle/fluid/operators/math/softmax.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class SoftmaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); +}; + +template +class SoftmaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c61cb647e899e25f3d9806c993395e898794ab --- /dev/null +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenMatrix = framework::EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = -64.; + return x < kThreshold ? 
kThreshold : x; + } +}; + +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); +} + +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto dot = (softmax * softmax_grad) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..e02bc02e0022b82085e29ac6c83677c0accfce49 --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
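The Eigen expressions above implement the usual numerically stable softmax (shift each row by its maximum before exponentiating) and the gradient identity dX = (dY - sum(dY * Y)) * Y taken row-wise. A scalar-loop sketch of both, purely for illustration and under the assumption of a row-major [rows, cols] layout:

#include <algorithm>
#include <cmath>
#include <vector>

// Row-wise softmax with the same max-shift used by SoftmaxFunctor above;
// subtracting the row maximum keeps exp() from overflowing.
void Softmax(const std::vector<float>& x, int rows, int cols,
             std::vector<float>* y) {
  y->resize(x.size());
  for (int r = 0; r < rows; ++r) {
    const float* xi = &x[r * cols];
    float* yi = &(*y)[r * cols];
    float m = *std::max_element(xi, xi + cols);
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) sum += (yi[c] = std::exp(xi[c] - m));
    for (int c = 0; c < cols; ++c) yi[c] /= sum;
  }
}

// Gradient identity used by SoftmaxGradFunctor: dx = (dy - dot) * y, where
// dot = sum over the class dimension of y * dy for that row.
void SoftmaxGrad(const std::vector<float>& y, const std::vector<float>& dy,
                 int rows, int cols, std::vector<float>* dx) {
  dx->resize(y.size());
  for (int r = 0; r < rows; ++r) {
    float dot = 0.f;
    for (int c = 0; c < cols; ++c) dot += y[r * cols + c] * dy[r * cols + c];
    for (int c = 0; c < cols; ++c)
      (*dx)[r * cols + c] = (dy[r * cols + c] - dot) * y[r * cols + c];
  }
}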
*/ + +#include "paddle/fluid/operators/math/unpooling.h" +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e74270fdf16b470ab6438a3283525c725b2d01b --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.cu @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
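The CPU unpooling functor above is a pure scatter: for each (batch, channel), the pooled value at position i is written to the output location stored in indices[i], and everything else remains zero; the gradient functor simply gathers in the opposite direction. A single-channel sketch of that scatter, with hypothetical data, to make the index semantics concrete (not part of the patch):

#include <cassert>
#include <cstdio>
#include <vector>

// Unpool one channel: each pooled value goes back to the flattened output
// position recorded in `indices`; all other outputs stay zero.
void UnpoolChannel(const std::vector<float>& input,
                   const std::vector<int>& indices, int output_size,
                   std::vector<float>* output) {
  output->assign(output_size, 0.f);
  for (size_t i = 0; i < input.size(); ++i) {
    assert(indices[i] < output_size && "err index in unpooling!");
    (*output)[indices[i]] = input[i];
  }
}

int main() {
  // Four pooled maxima and the flattened 4x4 positions they came from.
  std::vector<float> pooled = {9, 8, 7, 6};
  std::vector<int> indices = {5, 2, 12, 11};
  std::vector<float> unpooled;
  UnpoolChannel(pooled, indices, 16, &unpooled);
  for (int i = 0; i < 16; ++i)
    printf("%.0f%s", unpooled[i], (i % 4 == 3) ? "\n" : " ");
  return 0;
}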
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/unpooling.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
+ */ +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h new file mode 100644 index 0000000000000000000000000000000000000000..f245ba7ba873e7f217c577e0c895f9a8d48e9cdf --- /dev/null +++ b/paddle/fluid/operators/math/unpooling.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc new file mode 100644 index 0000000000000000000000000000000000000000..ded0bbc74477656310cb4d464c5709173f20f505 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * vol = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx = + ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? 
static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + +/* + * vol = [input_channels,input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "mismatching."); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx = + ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu new file mode 100644 index 0000000000000000000000000000000000000000..35ef24c7f5ffe793a4aefe69807da5ffcf5ced4a --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.cu @@ -0,0 +1,262 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
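The PADDLE_ENFORCE_EQ checks in both functors above verify the same size relation for every spatial axis. A tiny standalone helper restating that formula, with the numbers from the vol2col_test that appears later in this patch used as a sanity check (illustrative only):

#include <cstdio>

// Size relation checked per axis (depth, height, width) by the functors above:
//   out = (in + 2 * pad - (dilation * (filter - 1) + 1)) / stride + 1
int ConvOutSize(int in, int filter, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (filter - 1) + 1)) / stride + 1;
}

int main() {
  // A 2 x 2 x 3 volume with 2 x 2 x 2 filters, stride 1, no padding or
  // dilation gives a 1 x 1 x 2 output grid.
  printf("%d %d %d\n", ConvOutSize(2, 2, 1, 0, 1), ConvOutSize(2, 2, 1, 0, 1),
         ConvOutSize(3, 2, 1, 0, 1));  // prints: 1 1 2
  return 0;
}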
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void vol2col(int num_kernels, const T* data_vol, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + int w_out = index % output_width; + int h_out = (index / output_width) % output_height; + int d_out = (index / output_width / output_height) % output_detph; + int channel_in = index / output_width / output_height / output_detph; + int channel_out = channel_in * filter_depth * filter_height * filter_width; + int w_in = w_out * stride_width - padding_width; + int h_in = h_out * stride_height - padding_height; + int d_in = d_out * stride_depth - padding_depth; + + data_col += ((channel_out * output_detph + d_out) * output_height + h_out) * + output_width + + w_out; + data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in; + for (int k = 0; k < filter_depth; ++k) { + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int d = d_in + k * dilation_d; + int h = h_in + i * dilation_h; + int w = w_in + j * dilation_w; + int col_idx = (k * dilation_d * height + i * dilation_h) * width + + j * dilation_w; + *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? 
data_vol[col_idx] + : 0; + data_col += output_detph * output_height * output_width; + } + } + } + } +} + +/* + * im = [input_channels,intpu_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col->dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_outputs = + input_channels * output_depth * output_height * output_width; + + const int threads = 1024; + const int blocks = (num_outputs + 1024 - 1) / 1024; + vol2col<<>>( + num_outputs, vol.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + col->data()); + } +}; + +template +__global__ void col2vol(int num_kernels, const T* data_col, int depth, + int height, int width, int dilation_d, int dilation_h, + int dilation_w, int filter_depth, int filter_height, + int filter_width, int stride_depth, int stride_height, + int stride_width, int padding_depth, int padding_height, + int padding_width, int output_detph, int output_height, + int output_width, T* data_vol) { + const int d_filter_depth = dilation_d * (filter_depth - 1) + 1; + const int d_filter_height = dilation_h * (filter_height - 1) + 1; + const int d_filter_width = dilation_w * (filter_width - 1) + 1; + + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + T src_val = 0; + int w = index % width + padding_width; + int h = (index / width) % height + padding_height; + int d = (index / width / height) % depth + padding_depth; + int c = index / width / height / depth; + + // compute the start and end of the output + int w_col_start = + (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, output_width); + int h_col_start = + (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, output_height); + int d_col_start = + (d < d_filter_depth) ? 
0 : (d - d_filter_depth) / stride_depth + 1; + int d_col_end = min(d / stride_depth + 1, output_detph); + + for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + int d_off = (d - d_col * stride_depth); + int h_off = (h - h_col * stride_height); + int w_off = (w - w_col * stride_width); + if (d_off % dilation_d == 0 && h_off % dilation_h == 0 && + w_off % dilation_w == 0) { + d_off /= dilation_d; + h_off /= dilation_h; + w_off /= dilation_w; + + int data_col_index = + (((((c * filter_depth + d_off) * filter_height + h_off) * + filter_width + + w_off))); + data_col_index = + ((data_col_index * output_detph + d_col) * output_height + + h_col) * + output_width + + w_col; + src_val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = src_val; + } +} + +/* + * im = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const { + PADDLE_ENFORCE(vol->dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol->dims()[0]; + int input_depth = vol->dims()[1]; + int input_height = vol->dims()[2]; + int input_width = vol->dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + + PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth, + "input_depth and output_depth are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height, + "input_height and output_height are " + "Mismatching."); + PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width, + "input_width and output_width are " + "Mismatching."); + + int num_kernels = input_channels * input_depth * input_height * input_width; + + const int threads = 1024; + const int blocks = (num_kernels + 1024 - 1) / 1024; + + col2vol<<>>( + num_kernels, col.data(), input_depth, input_height, input_width, + dilations[0], dilations[1], dilations[2], filter_depth, filter_height, + filter_width, strides[0], strides[1], strides[2], paddings[0], + paddings[1], paddings[2], output_depth, output_height, output_width, + vol->data()); + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..3ce38b2d11f7c64f1004a73ecfc7d85a5a6346ba --- /dev/null +++ b/paddle/fluid/operators/math/vol2col.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +/* + * \brief Converts the feature data of four dimensions(CDHW) into a colData of + * seven dimensions in the Vol2ColFunctor calculation, + * And in the Col2VolFunctor calculation, it is reversed. + * + * \param volData Vol data. + * \param volShape The shape of volData, + * [input_channels, input_depth, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * \param dilations dilation data. + * \param 3-dimension [dilation_depth, dilation_height, dilation_width]. + * + * \param strides stride data. + * \param 3-dimension [stride_depth, stride_height, stride_width]. + * + * \param paddings padding data. + * \param 3-dimension [d_pad, h_pad, w_pad]. + * + * The shape of colData is: + * [input_channels, filter_depth, filter_height, filter_width, output_depth, + * output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_depth * filter_height * filter_width, and the width + * is equal output_depth * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_depth, + * filter_height, + * filter_width, ======> [height, width] + * output_depth, + * output_height, + * output_width] + * + * \note The caller needs to ensure that volShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Vol2ColFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* col) const; +}; + +template +class Col2VolFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, + framework::Tensor* vol) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..af0a900f80e9bec2c5cfd0ec7dc66beabba049d7 --- /dev/null +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
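The header comment above describes how the seven-dimensional col tensor is viewed as a two-dimensional matrix so that the convolution reduces to a GEMM. A quick sketch of the two shapes involved, using made-up example sizes that are not taken from the patch:

#include <cstdio>

int main() {
  // Assumed example sizes, for illustration only.
  long input_channels = 3, filter_depth = 2, filter_height = 2, filter_width = 2;
  long output_depth = 4, output_height = 8, output_width = 8;

  // col: [C, kD, kH, kW, oD, oH, oW] viewed as a [height, width] matrix.
  long height = input_channels * filter_depth * filter_height * filter_width;
  long width = output_depth * output_height * output_width;
  printf("convolution matrix: %ld x %ld\n", height, width);  // 24 x 256

  // Multiplying a [num_filters, height] weight matrix by this col matrix
  // yields [num_filters, oD * oH * oW], i.e. the flattened convolution output.
  return 0;
}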
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/vol2col.h" +#include +#include + +template +void testVol2col() { + paddle::framework::Tensor input; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; + paddle::framework::Tensor output_tmp; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + std::vector strides({1, 1, 1}); + std::vector paddings({0, 0, 0}); + std::vector dilations({1, 1, 1}); + int output_depth = + (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1; + int output_height = + (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1; + int output_width = + (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor vol2col; + vol2col(*context, input, dilations, strides, paddings, &output); + + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; + float* out_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output.data(); + } else { + Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp); + out_cfo_ptr = output_tmp.data(); + } + + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } + + // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + Copy(input_tmp, *place, *context, &input); + } + + paddle::operators::math::Col2VolFunctor col2vol; + col2vol(*context, output, dilations, strides, paddings, &input); + + float* in_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_ptr = input.data(); + } else { + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + in_ptr = input_tmp.data(); + } + + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_ptr[i], col_2_vol[i]); + } +} + +TEST(math, vol2col) { + testVol2col(); +#ifdef PADDLE_WITH_CUDA + testVol2col(); +#endif // PADDLE_WITH_CUDA +} diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..267b0057bf4894705b7e6eddb8e2e2eaa5c18c8e --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cc @@ -0,0 +1,244 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MatMulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of MatMulOp should not be null."); + + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + bool transpose_x = context->Attrs().Get("transpose_X"); + bool transpose_y = context->Attrs().Get("transpose_Y"); + + PADDLE_ENFORCE_GE(dim_x.size(), 1, + "Input tensor X must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_y.size(), 1, + "Input tensor Y must be at least 1-dimensional."); + + std::vector out_dim; + int64_t batch_count = 1; + if (dim_x.size() > 3) { + PADDLE_ENFORCE_EQ( + dim_y.size(), dim_x.size(), + "The dimensions of X and Y must be the same, and both of " + "them should be %d-dimensional.", + dim_x.size()); + + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + for (int j = 0; j < dim_x.size() - 2; ++j) { + PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j], + "The %d-th dimension of X and Y must be the same.", + j); + out_dim.push_back(dim_x[j]); + batch_count *= dim_x[j]; + } + } + + int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0; + bool remove_initial_dim = false, remove_final_dim = false; + + switch (dim_x.size()) { + case 1: + if (transpose_x) { + M = dim_x[0]; + KX = 1; + } else { + M = 1; + KX = dim_x[0]; + remove_initial_dim = true; + } + break; + case 2: + M = transpose_x ? dim_x[1] : dim_x[0]; + KX = transpose_x ? dim_x[0] : dim_x[1]; + break; + case 3: + batchCountX = dim_x[0]; + M = transpose_x ? dim_x[2] : dim_x[1]; + KX = transpose_x ? dim_x[1] : dim_x[2]; + break; + default: + batchCountX = batch_count; + size_t mat_s = dim_x.size() - 2; + M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s]; + KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1]; + break; + } + + switch (dim_y.size()) { + case 1: + if (transpose_y) { + N = dim_y[0]; + KY = 1; + } else { + N = 1; + KY = dim_y[0]; + remove_final_dim = true; + } + break; + case 2: + KY = transpose_y ? dim_y[1] : dim_y[0]; + N = transpose_y ? dim_y[0] : dim_y[1]; + break; + case 3: + batchCountY = dim_y[0]; + KY = transpose_y ? dim_y[2] : dim_y[1]; + N = transpose_y ? dim_y[1] : dim_y[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = dim_y.size() - 2; + KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s]; + N = transpose_y ? 
dim_y[mat_s] : dim_y[mat_s + 1]; + } + + PADDLE_ENFORCE_EQ( + KX, KY, + "First matrix's width must be equal with second matrix's height."); + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + + std::vector dim_out; + if (batchCount) { + if (dim_x.size() > 3) { + dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end()); + } else { + dim_out.push_back(batchCount); + } + } + if (!remove_initial_dim) { + dim_out.push_back(M); + } + if (!remove_final_dim) { + dim_out.push_back(N); + } + if (dim_out.size() == 0) { + // We don't support 0-dimensional Tensors (scalars), so instead + // treat the output as a Tensor of shape (1, ) in this case. + dim_out.push_back(1); + } + context->SetOutputDim("Out", framework::make_ddim(dim_out)); + context->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of MatMul op"); + AddInput("Y", "The second input of MatMul op"); + AddOutput("Out", "The output of MatMul op"); + AddAttr("transpose_X", + R"DOC(If true, use the transpose of `X`. + )DOC") + .SetDefault(false); + AddAttr("transpose_Y", + R"DOC(If true, use the transpose of `Y`. + )DOC") + .SetDefault(false); + AddComment(R"DOC( +MatMul Operator. + + +This operator is used to perform (batched) matrix multiplication +over the last two dimensions of the input tensors `X` and `Y`. + +If a transpose flag is specified, the last two dimensions of the +tensor are transposed. If the tensor is rank-1 of shape [D], then +for `X` it is treated as [1, D] in nontransposed form and as [D, 1] +in transposed form, whereas for `Y` it is the opposite: It is treated +as [D, 1] in nontransposed form and as [1, D] in transposed form. + +Examples without transpose: +- X: [K], Y: [K] => Out: [1] +- X: [K], Y: [K, N] => Out: [N] +- X: [B, M, K], Y: [K] => Out: [B, M] +- X: [M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N] + +The behavior is designed to be similar to the `numpy.matmul` function. +The differences are: +- When the rank of the input data is less than or equal to 3, it + is similar to the `numpy.matmul` function. +- When the rank of the input is greater than 3, the rank of X and + Y must be equal, and the first `rank - 2` dimensions must be equal. +- We add `transpose_X` and `transpose_Y` flags. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input `X`. 
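The shape rules listed in the operator comment are easy to sanity-check with a small stand-alone helper. The sketch below is a simplification made for illustration: it ignores the transpose flags and the rank-greater-than-3 case, and only mirrors the rank-1/2/3 examples from the list above.

#include <cassert>
#include <cstdint>
#include <vector>

// Output shape of matmul(X, Y) without transpose flags: a rank-1 X is treated
// as [1, K], a rank-1 Y as [K, 1], and the inserted dimensions are dropped
// again from the result.
std::vector<int64_t> MatMulOutShape(std::vector<int64_t> x,
                                    std::vector<int64_t> y) {
  bool x_was_vec = x.size() == 1, y_was_vec = y.size() == 1;
  if (x_was_vec) x.insert(x.begin(), 1);
  if (y_was_vec) y.push_back(1);
  assert(x.back() == y[y.size() - 2]);  // inner dimensions must match

  std::vector<int64_t> out;
  // The batch dimension comes from whichever operand is rank-3.
  if (x.size() == 3 || y.size() == 3) out.push_back(x.size() == 3 ? x[0] : y[0]);
  if (!x_was_vec) out.push_back(x[x.size() - 2]);  // M
  if (!y_was_vec) out.push_back(y.back());         // N
  if (out.empty()) out.push_back(1);                // [K] x [K] -> [1]
  return out;
}

int main() {
  assert(MatMulOutShape({4}, {4}) == std::vector<int64_t>({1}));
  assert(MatMulOutShape({4}, {4, 5}) == std::vector<int64_t>({5}));
  assert(MatMulOutShape({2, 3, 4}, {4}) == std::vector<int64_t>({2, 3}));
  assert(MatMulOutShape({2, 3, 4}, {2, 4, 5}) == std::vector<int64_t>({2, 3, 5}));
  return 0;
}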
+ +)DOC"); + } +}; + +class MatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = context->GetInputDim("X"); + auto y_dims = context->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(y_grad_name)) { + context->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, + ops::MatMulOpGrad); +REGISTER_OP_CPU_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.cu.cc b/paddle/fluid/operators/matmul_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..988787f0fe40ccdb266327dbff400a72f5dc448b --- /dev/null +++ b/paddle/fluid/operators/matmul_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matmul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f4cae3c91cb03980b915a70242e368852322b365 --- /dev/null +++ b/paddle/fluid/operators/matmul_op.h @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matmul.h" + +namespace paddle { +namespace operators { +namespace matmul_detail { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +using framework::make_ddim; +using framework::vectorize; + +template +class MatMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + Tensor* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); + } +}; + +template +inline Tensor Reshape(const Tensor& input, const DDim& dims) { + Tensor output; + output.ShareDataWith(input); + output.Resize(dims); + return output; +} + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndM(const Tensor& input) { + Tensor output; + output.ShareDataWith(input); + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; + output.Resize(make_ddim(out_dims)); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { + Tensor output; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector axis = {1, 0, 2}; + math::Transpose trans; + trans(context, input, &output, axis); + std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + } else { + output.ShareDataWith(input); + } + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. 
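The gradient table in the comment above is just the chain rule for Out = X Y; its no-transpose column reads dX = dOut Y^T and dY = X^T dOut. Below is a throwaway numeric check of that column with naive loops instead of MatMulFunctor; the helper and the sample matrices are invented for the example and are not part of the patch.

#include <cstdio>
#include <vector>

// C[m x n] = op(A) * op(B), where op(A) is m x k and op(B) is k x n.
// If trans_a is set, A is stored as k x m; if trans_b is set, B as n x k.
std::vector<double> NaiveMatMul(const std::vector<double>& A,
                                const std::vector<double>& B, int m, int k,
                                int n, bool trans_a, bool trans_b) {
  std::vector<double> C(m * n, 0.0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) {
        double a = trans_a ? A[p * m + i] : A[i * k + p];
        double b = trans_b ? B[j * k + p] : B[p * n + j];
        C[i * n + j] += a * b;
      }
  return C;
}

int main() {
  // X: 2x3, Y: 3x2, Out = X * Y is 2x2. Take dOut to be all ones.
  std::vector<double> X = {1, 2, 3, 4, 5, 6}, Y = {1, 0, 0, 1, 1, 1};
  std::vector<double> dOut(4, 1.0);

  // dX = dOut * Y^T  (2x2 times 2x3 -> 2x3)
  std::vector<double> dX = NaiveMatMul(dOut, Y, 2, 2, 3, false, true);
  // dY = X^T * dOut  (3x2 times 2x2 -> 3x2)
  std::vector<double> dY = NaiveMatMul(X, dOut, 3, 2, 2, true, false);

  for (double v : dX) printf("%.0f ", v);  // 1 1 2 1 1 2
  printf("\n");
  for (double v : dY) printf("%.0f ", v);  // 5 5 7 7 9 9
  printf("\n");
  return 0;
}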
+template +class MatMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + const Tensor& dout = *context.Input(framework::GradVarName("Out")); + Tensor* dx = context.Output(framework::GradVarName("X")); + Tensor* dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + // If X is a vector, reshape it to a matrix. + if (x_dims.size() == 1) { + x_dims.insert(x_dims.begin(), 1); + } + + // If Y is a vector, reshape it to a matrix. + if (y_dims.size() == 1) { + y_dims.push_back(1); + } + + int batch_count = 0; + // The first rank-2 dimensions are accumulated on the batch_count, and the + // last two dimensions are used for matrix multiplication. + if (x_dims.size() > 3) { + batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1, + std::multiplies()); + } + // Fix the dOut dimensions. + int M = 0, N = 0, batchCountX = 0, batchCountY = 0; + + switch (x_dims.size()) { + case 2: + M = transpose_x ? x_dims[1] : x_dims[0]; + break; + case 3: + batchCountX = x_dims[0]; + M = transpose_x ? x_dims[2] : x_dims[1]; + break; + default: + batchCountX = batch_count; + size_t mat_s = x_dims.size() - 2; + M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s]; + } + + switch (y_dims.size()) { + case 2: + N = transpose_y ? y_dims[0] : y_dims[1]; + break; + case 3: + batchCountY = y_dims[0]; + N = transpose_y ? y_dims[1] : y_dims[2]; + break; + default: + batchCountY = batch_count; + size_t mat_s = y_dims.size() - 2; + N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1]; + } + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + std::vector dout_dims = {M, N}; + if (batchCount) { + if (x_dims.size() > 3) { + dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2); + } else { + dout_dims.insert(dout_dims.begin(), batchCount); + } + } + Tensor X = Reshape(x, make_ddim(x_dims)); + Tensor Y = Reshape(y, make_ddim(y_dims)); + Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + + auto& dev_ctx = context.template device_context(); + if (dx) { + dx->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dX = + (x_dims.size() == 2 && y_dims.size() == 3) + ? CombineBatchAndN(dev_ctx, dOut) + : dOut; + if (x_dims.size() == 2 && y_dims.size() == 3) { + Y = transpose_y ? CombineBatchAndM(Y) + : CombineBatchAndN(dev_ctx, Y); + } + if (transpose_x) { + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); + } + } + + if (dy) { + dy->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3) + ? CombineBatchAndM(dOut) + : dOut; + if (y_dims.size() == 2 && x_dims.size() == 3) { + X = transpose_x ? 
CombineBatchAndN(dev_ctx, X) + : CombineBatchAndM(X); + dOut = CombineBatchAndM(dOut); + } + if (transpose_y) { + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); + } else { + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); + } + } + } +}; +} // namespace matmul_detail + +using matmul_detail::MatMulKernel; +using matmul_detail::MatMulGradKernel; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..eff8b927e52c94a4e19bb10c644cbaa34a7a0581 --- /dev/null +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class MaxSeqenceLenOp : public framework::OperatorBase { + public: + MaxSeqenceLenOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + int64_t *out_ptr = out->mutable_data({1}, platform::CPUPlace()); + *out_ptr = rank_table.items()[0].length; + } +}; + +class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RankTable", "The lod_rank_table."); + AddOutput("Out", "The max sequence length."); + AddComment( + R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + } +}; + +class MaxSeqenceLenInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", {1}); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp, + paddle::operators::MaxSeqenceLenOpProtoMaker, + paddle::operators::MaxSeqenceLenInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ce12cd4c4d93327dfcf9eb40ae1ff429f703419 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of maxout operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of maxout operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr( + "groups", + R"DOC("Specifies how many groups the input tensor will be split" + "in the channel dimension. And the number of output channel is " + "the number of channels divided by groups.." + )DOC"); + AddComment(R"DOC( +MaxOut Operator. + +Assumed the input shape is (N, Ci, H, W). +The output shape is (N, Co, H, W). +Then $Co = Ci / groups$ and the operator formula is as follows: + +$$ +y_{si+j} = \max_k x_{gsi + sk + j} \\ +g = groups \\ +s = \frac{input.size}{num\_channels} \\ +0 \le i < \frac{num\_channels}{groups} \\ +0 \le j < s \\ +0 \le k < groups +$$ + +Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + +)DOC"); + } +}; + +class MaxOutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxoutOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MaxoutOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int groups = ctx->Attrs().Get("groups"); + // check groups > 1 + PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop"); + std::vector output_shape({in_x_dims[0], in_x_dims[1] / groups}); + output_shape.push_back(in_x_dims[2]); + output_shape.push_back(in_x_dims[3]); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class MaxOutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, 
maxout_grad, + ops::MaxOutOpGrad); +REGISTER_OP_CPU_KERNEL( + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3f45c90cde754bcbf985092c5cbf31f134a2eee --- /dev/null +++ b/paddle/fluid/operators/maxout_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/maxout_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e5de3e3760b99c9bde9dd86e8851dc2f65e4b2d2 --- /dev/null +++ b/paddle/fluid/operators/maxout_op.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/maxouting.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxOutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + int groups = context.template Attr("groups"); + + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); + } +}; + +template +class MaxOutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + int groups = context.template Attr("groups"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0.0)); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1043820345a21cc3a7bbdf45d2914f9319c8e708 --- /dev/null +++ b/paddle/fluid/operators/mean_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_op.h" + +namespace paddle { +namespace operators { + +class MeanOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MeanOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MeanOp should not be null."); + ctx->SetOutputDim("Out", {1}); + } +}; + +class MeanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of mean op"); + AddOutput("Out", "The output of mean op"); + AddComment(R"DOC( +Mean Operator. + +Out is a scalar which is the mean of all elements in X. 
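+For example, if X is the 2 x 2 tensor [[1, 2], [3, 4]], then Out is [2.5].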
+ +)DOC"); + } +}; + +class MeanGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class MeanGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("mean_grad"); + grad_op->SetInput("X", Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); +REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccf2248760a551174953b8b55dc9a69454074885 --- /dev/null +++ b/paddle/fluid/operators/mean_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/mean_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ae162287da6a5b9e826dec9552262e73468ee58a --- /dev/null +++ b/paddle/fluid/operators/mean_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +class MeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto X = EigenVector::Flatten(*input); + auto y = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + + y.device(place) = X.mean(); + } +}; + +template +class MeanGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto OG = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(OG->numel() == 1, "Mean Gradient should be scalar"); + auto IG = context.Output(framework::GradVarName("X")); + IG->mutable_data(context.GetPlace()); + + T ig_size = static_cast(IG->numel()); + Eigen::DSizes bcast(ig_size); + + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = + (EigenVector::From(*OG) / ig_size).broadcast(bcast); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..255f55334093213df867852e4d222f0e227e8c5d --- /dev/null +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +class MergeLoDTensorOp : public framework::OperatorBase { + public: + MergeLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + auto &x = scope.FindVar(Input("X"))->Get(); + auto &mask = scope.FindVar(Input("Mask"))->Get(); + auto &in_true = scope.FindVar(Input("InTrue"))->Get(); + auto &in_false = + scope.FindVar(Input("InFalse"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + auto level = static_cast(Attr("level")); + + auto &mask_dim = mask.dims(); + + std::unique_ptr cpu_mask{new framework::LoDTensor()}; + if (platform::is_cpu_place(mask.place())) { + cpu_mask->ShareDataWith(mask); + } else if (platform::is_gpu_place(mask.place())) { +#ifdef PADDLE_WITH_CUDA + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } + auto *mask_data = cpu_mask->data(); + + int rank = in_true.dims().size(); + platform::Place place = in_true.place(); + std::type_index data_type = in_true.type(); + framework::DDim in_true_dims = + framework::slice_ddim(in_true.dims(), 1, rank); + + int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; + + auto in_true_dim_vec = framework::vectorize(in_true_dims); + in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size); + + framework::DDim out_dims = framework::make_ddim(in_true_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto *out_lod = out->mutable_lod(); + out_lod->clear(); + size_t out_offset = 0; + + // Build LoDTensor `out` + + size_t in_true_idx = 0; + size_t in_false_idx = 0; + for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { + const framework::LoDTensor *input = nullptr; + size_t *in_idx = nullptr; + if (static_cast(mask_data[i]) == 0) { + input = &in_false; + in_idx = &in_false_idx; + } else { + input = &in_true; + in_idx = &in_true_idx; + } + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + input->lod(), *in_idx, (*in_idx) + 1, 0); + auto &lod_length = lod_and_offset.first; + + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + auto slice = out->Slice(out_offset, out_offset + len); + framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx, + &slice); + out_offset += len; + (*in_idx) += 1; + } + + for (size_t i = 0; i < level; i++) { + out_lod->insert(out_lod->begin(), x.lod()[i]); + } + } +}; + +class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input LoDTensor, contains complete lod information to " + "construct the output"); + AddInput("Mask", "A bool column vector which mask the 
input"); + AddInput("InTrue", "The True branch to be merged"); + AddInput("InFalse", "The False branch to be merged"); + AddOutput("Out", "The merged output LoDTensor"); + AddAttr("level", "(int) the specific lod level to rank.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment( + R"DOC( + Merge True and False branches of LoDTensor into a single Output, + with a mask at certain lod level. X is used to obtain complete + lod information. Please refer to SplitLoDTensorOp.)DOC"); + } +}; + +class MergeLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "MergeLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("Mask"), + "MergeLoDTensorOp must has input Mask."); + PADDLE_ENFORCE(context->HasInput("InTrue"), + "MergeLoDTensorOp must has input InTrue."); + PADDLE_ENFORCE(context->HasInput("InFalse"), + "MergeLoDTensorOp must has input InFalse."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "MergeLoDTensorOp must has output Out"); + + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(mask_dim.size(), 2); + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + + context->SetOutputDim("Out", context->GetInputDim("InTrue")); + } +}; + +class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("split_lod_tensor"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("Mask", Input("Mask")); + grad_op->SetOutput("OutTrue", InputGrad("InTrue")); + grad_op->SetOutput("OutFalse", InputGrad("InFalse")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp, + ops::MergeLoDTensorOpProtoMaker, + ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker); diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..73a6c0b679310ac4108a915836b5ed497853b38b --- /dev/null +++ b/paddle/fluid/operators/mine_hard_examples_op.cc @@ -0,0 +1,330 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } 
else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == 
MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("ClsLoss")->type()), + ctx.device_context()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched indices with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(float) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(float) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. 
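+For example, when mining_type is max_negative, neg_pos_ratio is 3.0, an image
+has 2 matched (positive) prior boxes and 10 prior boxes eligible as negatives,
+the number of selected negative boxes is min(3.0 * 2, 10) = 6.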
+)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a35d668ccfa8a95cd41c4a943aad6ff915cc7dd --- /dev/null +++ b/paddle/fluid/operators/minus_op.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/minus_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class MinusOp : public framework::OperatorWithKernel { + public: + MinusOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MinusOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of MinusOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MinusOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ( + x_dims, y_dims, + "Minus operator must take two tensor with same num of elements"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MinusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The left tensor of minus operator."); + AddInput("Y", "The right tensor of minus operator."); + AddOutput("Out", "The output tensor of minus operator."); + + AddComment(R"DOC( +Minus Operator. + +Equation: + + $Out = X - Y$ + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input `X`. 
+ +)DOC"); + } +}; + +class MinusGradMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + std::vector> ops; + auto x_g = InputGrad("X"); + if (!x_g.empty()) { + auto *x_g_op = new framework::OpDesc(); + x_g_op->SetType("scale"); + x_g_op->SetInput("X", OutputGrad("Out")); + x_g_op->SetOutput("Out", x_g); + x_g_op->SetAttr("scale", 1.0f); + ops.emplace_back(x_g_op); + } + + auto y_g = InputGrad("Y"); + if (!y_g.empty()) { + auto *y_g_op = new framework::OpDesc(); + y_g_op->SetType("scale"); + y_g_op->SetInput("X", OutputGrad("Out")); + y_g_op->SetOutput("Out", y_g); + y_g_op->SetAttr("scale", -1.0f); + ops.emplace_back(y_g_op); + } + + return ops; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce0b1fdc0419fead915511581270fb9984df9dc5 --- /dev/null +++ b/paddle/fluid/operators/minus_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/minus_op.h" + +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc94cbbeca264536669716beb5318432ba9689a4 --- /dev/null +++ b/paddle/fluid/operators/minus_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MinusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* left_tensor = context.Input("X"); + auto* right_tensor = context.Input("Y"); + auto* out_tensor = context.Output("Out"); + + out_tensor->mutable_data(context.GetPlace()); + auto& dev = + *context.template device_context().eigen_device(); + framework::EigenVector::Flatten(*out_tensor).device(dev) = + framework::EigenVector::Flatten(*left_tensor) - + framework::EigenVector::Flatten(*right_tensor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f2d16531658d42556756e75895b7250a529f13df --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/modified_huber_loss_op.h" + +namespace paddle { +namespace operators { + +class ModifiedHuberLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1."); + + ctx->SetOutputDim("IntermediateVal", x_dims); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + } +}; + +class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input tensor of modified huber loss op. " + "X is 2-D tensor with shape [batch_size, 1]."); + AddInput("Y", + "The target labels of modified huber loss op. " + "The shape of Y is the same as X. Values of Y must be 0 or 1."); + AddOutput("IntermediateVal", + "Variable to save intermediate result which will be reused in " + "backward processing.") + .AsIntermediate(); + AddOutput("Out", "Classification loss for X."); + AddComment(R"DOC( +Modified Huber Loss Operator. + +This operator is used in binary classification problem. The shape of +input X and target Y are both [N, 1] and so is the shape of the output loss. +Since target Y is not differentiable, calculating gradient for Y is illegal. 
+The formula of modified huber loss is: + +$$ +L(y, f(x)) = +\begin{cases} +(\max(0, 1 - yf(x)))^2, \text{if} \ yf(x) >= -1 \\ + -4yf(x), \quad \text{otherwise} +\end{cases} +$$ + +Make sure the values of target label Y are in {0, 1} here. This operator will +scale values of Y to {-1, +1} when computing losses and gradients. + +)DOC"); + } +}; + +class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"), + "Intermediate value must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) must not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ( + intermediate_dims, x_dims, + "The shape of X and intermediate value must be the same."); + PADDLE_ENFORCE_EQ(out_grad_dims, x_dims, + "The shape of Input(Out@Grad) and X must be the same."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, + ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad, + ops::ModifiedHuberLossGradOp); + +REGISTER_OP_CPU_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..69ac2b1ed546a4755cd4e8d52a7f8b98b4f0e7b9 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/modified_huber_loss_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +struct ModifiedHuberLossBackward { + template + HOSTDEVICE void operator()(Tuple t) const { + auto inter_val = thrust::get<1>(t); + auto y_val = thrust::get<2>(t); + auto out_grad = thrust::get<3>(t); + if (inter_val < -1) { + thrust::get<0>(t) = -4 * (2 * y_val - 1) * out_grad; + } else if (inter_val < 1) { + thrust::get<0>(t) = -2 * (1 - inter_val) * (2 * y_val - 1) * out_grad; + } else { + thrust::get<0>(t) = 0; + } + } +}; + +template +class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + auto counts = framework::product(in1->dims()); + auto y_ptr = thrust::device_pointer_cast(in0->data()); + auto inter_val_ptr = thrust::device_pointer_cast(in1->data()); + auto out_grad_ptr = thrust::device_pointer_cast(in2->data()); + thrust::device_ptr x_grad_ptr( + out0->mutable_data(context.GetPlace())); + + auto iter_begin = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr, inter_val_ptr, y_ptr, out_grad_ptr)); + + auto iter_end = thrust::make_zip_iterator( + thrust::make_tuple(x_grad_ptr + counts, inter_val_ptr + counts, + y_ptr + counts, out_grad_ptr + counts)); + + thrust::for_each(iter_begin, iter_end, ModifiedHuberLossBackward()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + modified_huber_loss, + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a470a45e13b55a33a8485cac2038ce1cc761a3f3 --- /dev/null +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +struct CheckLabelValue { + HOSTDEVICE T operator()(const T& val) const { + PADDLE_ASSERT(val == static_cast(0) || val == static_cast(1)); + } +}; + +template +struct ModifiedHuberLossForward { + HOSTDEVICE T operator()(const T& val) const { + if (val < -1) { + return -4 * val; + } else if (val < 1) { + return (1 - val) * (1 - val); + } else { + return static_cast(0); + } + } +}; + +template +class ModifiedHuberLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("IntermediateVal"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto& place = + *context.template device_context().eigen_device(); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + // make sure value's of Y in {0, 1} + y.unaryExpr(CheckLabelValue()); + auto inter_val = EigenVector::Flatten(*out0); + // scale y to {-1, +1} and compute x * y + inter_val.device(place) = x * (2 * y - static_cast(1)); + auto loss = EigenVector::Flatten(*out1); + loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward()); + } +}; + +// CPU backward kernel +template +class ModifiedHuberLossGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + + if (out0) { + const T* y_ptr = in0->data(); + const T* inter_val_ptr = in1->data(); + const T* out_grad_ptr = in2->data(); + size_t counts = static_cast(framework::product(in1->dims())); + T* x_grad_ptr = out0->mutable_data(context.GetPlace()); + for (size_t i = 0; i < counts; ++i) { + if (inter_val_ptr[i] < -1) { + x_grad_ptr[i] = -4 * (2 * y_ptr[i] - 1) * out_grad_ptr[i]; + } else if (inter_val_ptr[i] < 1) { + x_grad_ptr[i] = -2 * (1 - inter_val_ptr[i]) * (2 * y_ptr[i] - 1) * + out_grad_ptr[i]; + } else { + x_grad_ptr[i] = 0; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3950ac99da93687596f62c6f020f4add1fb04ba --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/momentum_op.h" + +namespace paddle { +namespace operators { + +class MomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(param) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(grad) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Velocity"), + "Input(velocity) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of Momentum should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), + "Output(VelocityOut) of Momentum should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, + "Learning_rate should be a scalar"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + } +}; + +class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. " + "It shared memory with Input(Velocity)."); + + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum") + .SetDefault(false); + AddComment(R"DOC( +Momentum Optimizer. + +This optimizer has a flag for Nestrov Momentum. +The update equations are as follows: + +$$ +velocity = mu * velocity + gradient \\ +if (use\_nesterov): \\ + param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ +else: \\ + param = param - learning\_rate * velocity. \\ +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); +REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..28a14cd4b219b86ba4922a3485b99c2c861a74d9 --- /dev/null +++ b/paddle/fluid/operators/momentum_op.cu @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, bool use_nesterov, T* p_out, + T* v_out) { + T lr = learning_rate[0]; + if (use_nesterov) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T g_val = g[i]; + T v_new = v[i] * mu + g_val; + v_out[i] = v_new; + p_out[i] = p[i] - (g_val - v_new * mu) * lr; + } + } else { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T v_new = v[i] * mu + g[i]; + v_out[i] = v_new; + p_out[i] = p[i] - lr * v_new; + } + } +} + +template +class MomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + MomentumKernel<<>>( + p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fdab86b24eefe15b85f4ca6a49e54ea67c1e7bdf --- /dev/null +++ b/paddle/fluid/operators/momentum_op.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto p_out = framework::EigenVector::Flatten(*param_out); + auto v_out = framework::EigenVector::Flatten(*velocity_out); + + auto p = framework::EigenVector::Flatten(*param); + auto v = framework::EigenVector::Flatten(*velocity); + auto g = framework::EigenVector::Flatten(*grad); + auto* lr = learning_rate->data(); + + v_out = v * mu + g; + if (use_nesterov) { + p_out = p - (g - v_out * mu) * lr[0]; + } else { + p_out = p - lr[0] * v_out; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9375d8ea1297f5201b6294f61119f6b5603d988 --- /dev/null +++ b/paddle/fluid/operators/mul_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MulOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MulOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); + int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims"); + + VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims + << " x_num_col_dims=" << x_num_col_dims + << " y_num_col_dims=" << y_num_col_dims; + + PADDLE_ENFORCE_GT( + x_dims.size(), x_num_col_dims, + "The input tensor X's rank of MulOp should be larger than " + "x_num_col_dims."); + PADDLE_ENFORCE_GT( + y_dims.size(), y_num_col_dims, + "The input tensor Y's rank of MulOp should be larger than " + "y_num_col_dims."); + + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + + PADDLE_ENFORCE_EQ( + x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal to second matrix's height."); + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), The first input tensor of mul op."); + AddInput("Y", "(Tensor), The second input tensor of mul op."); + AddOutput("Out", "(Tensor), The output tensor of mul op."); + AddAttr( + "x_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $X$ is a tensor with more + than two dimensions, $X$ will be flattened into a two-dimensional + matrix first. The flattening rule is: the first `num_col_dims` + dimensions will be flattened to form the first dimension of the final + matrix (the height of the matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second dimension of the final + matrix (the width of the matrix). As a result, height of the + flattened matrix is equal to the product of $X$'s first + `x_num_col_dims` dimensions' sizes, and width of the flattened + matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` + dimensions' sizes. For example, suppose $X$ is a 5-dimensional + tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. + Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = + [24, 30]. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddAttr( + "y_num_col_dims", + R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened.
See comments of `x_num_col_dims` for more details. + )DOC") + .SetDefault(1) + .EqualGreaterThan(1); + AddComment(R"DOC( +Mul Operator. + +This operator is used to perform matrix multiplication for input $X$ and $Y$. + +The equation is: + +$$Out = X * Y$$ + +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. + +)DOC"); + } +}; + +class MulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + auto x_mat_dims = framework::flatten_to_2d( + x_dims, ctx->Attrs().Get("x_num_col_dims")); + auto y_mat_dims = framework::flatten_to_2d( + y_dims, ctx->Attrs().Get("y_num_col_dims")); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, + ops::MulOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f605fd84fb802a079f9ed13afc032cc2d6c2b0c --- /dev/null +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..745989f07f3646ab5d59f3d292030f3eb52dac49 --- /dev/null +++ b/paddle/fluid/operators/mul_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
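To make the flattening rule described in the `x_num_col_dims` / `y_num_col_dims` documentation above concrete, here is a small standalone C++ sketch; the helper name is hypothetical and the real operator uses framework::flatten_to_2d. It reproduces the [2, 3, 4, 5, 6] example with `x_num_col_dims` = 3.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Illustrative sketch only: flatten a shape to 2-D at num_col_dims, the same
// rule the operator applies before running the underlying GEMM.
std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                        int num_col_dims) {
  int64_t height = 1;
  int64_t width = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < num_col_dims ? height : width) *= dims[i];
  }
  return {height, width};
}

int main() {
  // X with shape [2, 3, 4, 5, 6] and x_num_col_dims = 3 flattens to [24, 30].
  auto x_mat = FlattenTo2D({2, 3, 4, 5, 6}, 3);
  assert(x_mat.first == 24 && x_mat.second == 30);
  // For Out = X * Y, the inner sizes must match: x_mat.second == y_mat.first.
  auto y_mat = FlattenTo2D({30, 7}, 1);
  assert(x_mat.second == y_mat.first);
  return 0;
}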
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + const Tensor* y = context.Input("Y"); + Tensor* z = context.Output("Out"); + const Tensor x_matrix = + x->dims().size() > 2 + ? framework::ReshapeToMatrix( + *x, context.template Attr("x_num_col_dims")) + : *x; + const Tensor y_matrix = + y->dims().size() > 2 + ? framework::ReshapeToMatrix( + *y, context.template Attr("y_num_col_dims")) + : *y; + + z->mutable_data(context.GetPlace()); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } +}; + +template +class MulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int x_num_col_dims = ctx.template Attr("x_num_col_dims"); + int y_num_col_dims = ctx.template Attr("y_num_col_dims"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + const Tensor* dout = ctx.Input(framework::GradVarName("Out")); + + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + Tensor* dx = ctx.Output(framework::GradVarName("X")); + Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..168e6f85d6ac9f8d6522afe871d82e708da63227 --- /dev/null +++ b/paddle/fluid/operators/multiclass_nms_op.cc @@ -0,0 +1,393 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
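Restating the shape comments inside MulGradKernel above as equations, with $X$ flattened to an $M \times K$ matrix, $Y$ to $K \times N$, and $dOut$ to $M \times N$:

$$
dX = dOut \cdot Y^{T} \quad (M \times K), \qquad dY = X^{T} \cdot dOut \quad (K \times N)
$$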
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +constexpr int64_t kOutputDim = 6; +constexpr int64_t kBBoxSize = 4; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + + PADDLE_ENFORCE_EQ(box_dims.size(), 3, + "The rank of Input(BBoxes) must be 3."); + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3."); + PADDLE_ENFORCE_EQ(box_dims[2], 4, + "The 2nd dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], + "The 1st dimensiong of Input(BBoxes) must be equal to " + "3rd dimension of Input(Scores), which represents the " + "predicted bboxes."); + + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + ctx->SetOutputDim("Out", {box_dims[1], 6}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Scores")->type()), + ctx.device_context()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices) const { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + std::map>& indices, + int& num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k, + &(indices[c])); + num_det += indices[c].size(); + } + + num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. 
+ std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(indices); + num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, + std::map>& selected_indices, + Tensor* outs) const { + int predict_dim = scores.dims()[1]; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + odata[count * kOutputDim] = label; // label + odata[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + + int64_t batch_size = score_dims[0]; + int64_t class_num = score_dims[1]; + int64_t predict_dim = score_dims[2]; + int64_t box_dim = boxes->dims()[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + + Tensor ins_boxes = boxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1}, ctx.GetPlace()); + od[0] = -1; + } else { + outs->mutable_data({num_kept, kOutputDim}, ctx.GetPlace()); + for (int64_t i = 0; i < batch_size; ++i) { + Tensor ins_score = scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + + Tensor ins_boxes = boxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("BBoxes", + "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the " + "predicted locations of M bounding bboxes, N is the batch size. " + "Each bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax]."); + AddInput("Scores", + "(Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. 
For each category " + "there are a total of M scores which correspond to M bounding boxes. " + " Please note, M is equal to the 1st dimension of BBoxes. "); + AddAttr( + "background_label", + "(int64_t, default: 0) " + "The index of background label, the background label will be ignored. " + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences after filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, default: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], No is the total " + "number of detections in this mini-batch. For each instance, " + "the offsets in the first dimension are called LoD, the number of " + "offsets is N + 1; if LoD[i + 1] - LoD[i] == 0, there is " + "no detected bbox."); + AddComment(R"DOC( +This operator performs multi-class non maximum suppression (NMS) on batched +boxes and scores. + +In the NMS step, this operator greedily selects a subset of detection bounding +boxes whose scores are larger than score_threshold, if this threshold is +provided, and then selects the nms_top_k boxes with the largest confidence +scores if nms_top_k is larger than -1. Then this operator prunes away boxes +that have a high IOU (intersection over union) overlap with already selected +boxes by adaptive threshold NMS based on parameters of nms_threshold and nms_eta. + +After the NMS step, at most keep_top_k bboxes in total are kept per image if +keep_top_k is larger than -1. + +This operator supports multi-class and batched inputs. It applies NMS +independently for each class. The output is a 2-D LoDTensor; for each +image, the offsets in the first dimension of the LoDTensor are called LoD, and +the number of offsets is N + 1, where N is the batch size. If +LoD[i + 1] - LoD[i] == 0, there is no detected bbox for that image. If there +are no detected boxes for any image, all the elements in LoD are 0, and the Out +only contains one value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f89b00376ba7a759419fa60efe80575b6a8d1f2e --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
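As a rough, self-contained illustration of the greedy loop that NMSFast and the DOC above describe, here is a simplified C++ sketch; the types and names are illustrative rather than the kernel's own, and the adaptive nms_eta threshold decay is omitted for brevity. Candidates are visited in descending score order and kept only if their IoU with every previously kept box stays at or below the threshold.

#include <algorithm>
#include <array>
#include <vector>

// Illustrative sketch only: boxes are [xmin, ymin, xmax, ymax] in normalized
// coordinates.
using Box = std::array<float, 4>;

float IoU(const Box& a, const Box& b) {
  float iw = std::max(0.0f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  float ih = std::max(0.0f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  float inter = iw * ih;
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}

// Greedy NMS over one class; sorted_idx holds box indices by descending score.
std::vector<int> GreedyNMS(const std::vector<Box>& boxes,
                           const std::vector<int>& sorted_idx,
                           float nms_threshold) {
  std::vector<int> kept;
  for (int idx : sorted_idx) {
    bool keep = true;
    for (int k : kept) {
      if (IoU(boxes[idx], boxes[k]) > nms_threshold) {
        keep = false;
        break;
      }
    }
    if (keep) {
      kept.push_back(idx);
    }
  }
  return kept;
}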
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class MultiplexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null."); + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), + "MultiInput(X) shouldn't be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto ids_dim = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE( + ids_dim.size() == 2 && ids_dim[1] == 1, + "The index tensor must be a vector with size batchSize x 1."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto num_ins = ins_dims.size(); + PADDLE_ENFORCE(num_ins > 1, + "multiplex operator should have more than " + "one candidate input tensors."); + + auto in_dim = ins_dims[0]; + PADDLE_ENFORCE(in_dim.size() >= 2, + "The rank of candidate tensors must be not less than 2."); + for (size_t i = 1; i < num_ins; i++) { + auto dim = ins_dims[i]; + PADDLE_ENFORCE(in_dim == dim, + "All the candidate tensors must have the same size."); + } + ctx->SetOutputDim("Out", in_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", "The index tensor of multiplex operator."); + AddInput("X", "The candidate tensors of multiplex operator.") + .AsDuplicable(); + AddOutput("Out", "The output tensor of multiplex operator."); + AddComment(R"DOC( +Multiplex Operator. + +Multiplex multiple tensors according to the index provided by the index tensor. + +Ids: the index tensor. +X[0 : N - 1]: the candidate tensors for output (N >= 2). +For each index i from 0 to batchSize - 1, the output is the i-th row of the +the (Ids[i])-th tensor. + +For i-th row of the output tensor: + +$$y[i] = x_{k}[i]$$ + +where `y` is the output tensor, `x_{k}` is the k-th input tensor, +and `k = Ids[i]`. 
+ +)DOC"); + } +}; + +class MultiplexGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); + PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), + "Output(X@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); +REGISTER_OP_CPU_KERNEL( + multiplex, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel); +REGISTER_OP_CPU_KERNEL( + multiplex_grad, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ef7ef1dfcd04d59573bc6c726fef757f5f2ce23 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.cu @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
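A small sketch of the row-gather semantics documented for multiplex above, in plain C++; the flat row-major layout and the names are illustrative assumptions, while the real kernels copy rows with memory::Copy. Output row i is taken from row i of the candidate selected by Ids[i].

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative sketch only: every candidate is a row-major matrix with the
// same shape; out row i is copied from candidate ids[i].
std::vector<float> MultiplexRows(const std::vector<std::vector<float>>& candidates,
                                 const std::vector<int32_t>& ids, int64_t cols) {
  std::vector<float> out(ids.size() * cols);
  for (std::size_t i = 0; i < ids.size(); ++i) {
    const std::vector<float>& src = candidates[ids[i]];
    std::copy(src.begin() + i * cols, src.begin() + (i + 1) * cols,
              out.begin() + i * cols);
  }
  return out;
}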
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/multiplex_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MultiplexGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT((size_t)k, ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T), stream); + } + } +}; + +template +class MultiplexGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto* ids = ctx.Input("Ids"); + auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + // copy index to cpu + Tensor index_t_cpu; + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + auto* index = index_t_cpu.data(); + + auto stream = ctx.cuda_device_context().stream(); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T), stream); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( + multiplex_grad, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h new file mode 100644 index 0000000000000000000000000000000000000000..682117cb1b4581560d6b4a615d97e2d18a91ffd6 --- /dev/null +++ b/paddle/fluid/operators/multiplex_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace operators { + +template +class MultiplexCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto ins = ctx.MultiInput("X"); + auto ids = ctx.Input("Ids"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); + PADDLE_ENFORCE_LT(static_cast(k), ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T)); + } + } +}; + +template +class MultiplexGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* ids = ctx.Input("Ids"); + auto ins = ctx.MultiInput("X"); + auto d_ins = + ctx.MultiOutput(framework::GradVarName("X")); + for (size_t i = 0; i < d_ins.size(); i++) { + if (d_ins[i]) { + d_ins[i]->mutable_data(ctx.GetPlace()); + auto t = framework::EigenVector::Flatten(*d_ins[i]); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); + } + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto* index = ids->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (d_ins[k]) { + memory::Copy(place, d_ins[k]->data() + i * cols, place, + d_out->data() + i * cols, cols * sizeof(T)); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt similarity index 100% rename from paddle/operators/nccl/CMakeLists.txt rename to paddle/fluid/operators/nccl/CMakeLists.txt diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a8ce932ec51a3d85ef04a2c07e08186929632f2 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h new file mode 100644 index 0000000000000000000000000000000000000000..6e78613239e6c401bad5aa80746000c5b47cd031 --- /dev/null +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +constexpr int kInvalidGPUId = -1; + +struct Communicator { + std::vector comms_; + std::unordered_map comm_id_map_; + bool inited_; + + Communicator() {} + + int GetCommId(int device_id) const { return comm_id_map_.at(device_id); } + + void InitAll(const std::vector& gpus) { + comms_.resize(gpus.size()); + inited_ = false; + for (size_t i = 0; i < gpus.size(); ++i) { + comm_id_map_[gpus[i]] = i; + } + PADDLE_ENFORCE( + dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); + inited_ = true; + } + + ~Communicator() { + if (inited_) { + for (size_t i = 0; i < comms_.size(); ++i) { + // FIXME(dzh) : PADDLE_ENFORCE return void + dynload::ncclCommDestroy(comms_[i]); + } + } + } + + DISABLE_COPY_AND_ASSIGN(Communicator); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..52420ceba0de0323dae000aa301ce7838b3311b6 --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +// NCCLinitOp +class NCCLInitOp : public framework::OperatorBase { + public: + NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + const auto &name = Output("Communicator"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), + "Can not find variable '%s' in the scope.", name); + std::vector gpus = Attr>("gpus"); + PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty."); + + if (scope.FindVar(name) == nullptr) { + PADDLE_THROW("Output(Communicator) is needed for ncclInit operator."); + } + + platform::Communicator *comm = + scope.FindVar(name)->GetMutable(); + comm->InitAll(gpus); + } +}; + +class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Communicator", + "Create Communicator for communicating between gpus"); + AddAttr>("gpus", "(vector) GPU id lists"); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(R"DOC( +NCCLInit Operator. + +Create communicator. + +)DOC"); + } +}; + +// AllReduceOp +class NCCLAllReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of AllReduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Input(X) of AllReduce op input should not be NULL"); + + auto x_dims = ctx->GetInputsDim("X"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// ReduceOp +class NCCLReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Reduce op input should not be NULL"); + PADDLE_ENFORCE( + ctx->HasInput("Communicator"), + " Input(Communicator) of Reduce op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Input(X) of Reduce op input should not be NULL"); + + std::string reduction = ctx->Attrs().Get("reduction"); + PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), + "invalid reduction."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// BcastOp +class NCCLBcastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " 
Input(X) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasInput("Communicator"), + " Input(Communicator) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of Bcast op output should not be NULL"); + + int root = ctx->Attrs().Get("root"); + PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +// AllreduceOp +class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of AllReduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of AllReduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddComment(R"DOC( +NCCLAllReduce Operator. + +AllReduce the input tensors. + +)DOC"); + } +}; + +// ReduceOp +class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of Reduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Reduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLReduce Operator. + +Reduce the tensors. + +)DOC"); + } +}; + +// BcastOp +class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of BcastSend op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of Bcast"); + AddAttr("root", + "(int, default kInvalidGPUId) " + "Root gpu of the parameter. If not, " + "set(platform::kInvalidGPUId). Hashed by name.") + .SetDefault(platform::kInvalidGPUId); + AddComment(R"DOC( +NCCLBcast Operator. + +Bcast the tensors. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp, + paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker); + +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, + ops::NCCLAllReduceOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, + ops::NCCLBcastOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp, + ops::NCCLReduceOpMaker); diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..333aed2903e7873aa799bd34468b2e05ef2e556c --- /dev/null +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
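The "Hashed by name" wording in the root attribute above refers to the fallback the reduce kernel uses when root is left at kInvalidGPUId: a root device is derived deterministically from the input variable's name, so every rank picks the same root without extra coordination. A hedged sketch of that selection (the function name here is illustrative):

#include <cstddef>
#include <functional>
#include <string>

// Illustrative sketch only: map a variable name to a communicator index so
// all participating devices agree on the same root.
std::size_t RootFromName(const std::string& var_name, std::size_t num_comms) {
  std::hash<std::string> hasher;
  return hasher(var_name) % num_comms;
}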
+You may obtain a copy of the License at +http://www.apache.org/licenseshashernless required by applicable law or agreed +to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Communicator; +using framework::LoDTensor; + +template +class NCCLTypeWrapper; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclFloat; +}; + +template <> +class NCCLTypeWrapper { + public: + static const ncclDataType_t type = ncclDouble; +}; + +template +class NCCLAllReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. default ncclSum."); + } + + auto* comm = ctx.Input("Communicator"); + + auto stream = ctx.cuda_device_context().stream(); + + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " + << " invoke allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), + outs[i]->numel(), NCCLTypeWrapper::type, reduction_op_, + comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " + << " finished allreduce. send " << ins[i]->numel() << " recv " + << outs[i]->numel(); + } + } +}; + +template +class NCCLReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + auto ins = ctx.MultiInput("X"); // x0, x1, x2 + auto outs = ctx.MultiOutput("Out"); + + std::string reduction = ctx.Attr("reduction"); + ncclRedOp_t reduction_op_ = ncclSum; + + if (reduction == "ncclMin") { + reduction_op_ = ncclMin; + } else if (reduction == "ncclMax") { + reduction_op_ = ncclMax; + } else if (reduction == "ncclSum") { + reduction_op_ = ncclSum; + } else if (reduction == "ncclProd") { + reduction_op_ = ncclProd; + } else { + PADDLE_THROW("Invalid reduction. 
default ncclSum."); + } + + int root = ctx.Attr("root"); + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + auto ins_names = ctx.Inputs("X"); + std::hash hasher; + for (size_t i = 0; i < ins.size(); ++i) { + if (root == platform::kInvalidGPUId) { + root = hasher(ins_names[i]) % comm->comms_.size(); + } + T* recvbuffer = nullptr; + if (root == gpu_id) { + recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); + } + + VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + + PADDLE_ENFORCE(platform::dynload::ncclReduce( + ins[i]->data(), recvbuffer, ins[i]->numel(), + NCCLTypeWrapper::type, reduction_op_, root, comm->comms_[idx], + stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished reduce. send " + << ins[i]->numel() << " recv " << outs[i]->numel(); + } + } +}; + +template +class NCCLBcastKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + + int root = ctx.Attr("root"); + + auto* comm = ctx.Input("Communicator"); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + // device id + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int idx = comm->GetCommId(gpu_id); + + if (idx == root) { + auto ins = ctx.MultiInput("X"); + for (size_t i = 0; i < ins.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " + << ins[i]->numel(); + + VLOG(1) << " before ncclBcast"; + PADDLE_ENFORCE(platform::dynload::ncclBcast( + (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, + root, comm->comms_[idx], stream)); + VLOG(1) << " after ncclBcast"; + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast."; + } + } else { + auto outs = ctx.MultiOutput("Out"); + for (size_t i = 0; i < outs.size(); ++i) { + VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(outs[i]->dims()); + + PADDLE_ENFORCE(platform::dynload::ncclBcast( + outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), + NCCLTypeWrapper::type, root, comm->comms_[idx], stream)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + + VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv " + << outs[i]->numel(); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..212ed2f9b63de5061dc3a2eb86508d8a4c305f89 --- /dev/null +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +USE_NO_KERNEL_OP(ncclInit); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); + +namespace f = paddle::framework; +namespace p = paddle::platform; + +static std::vector gpu_list; + +// test data amount +const f::DDim kDims = {100, 100}; + +// nccl op common tester, init communicator. +class NCCLTester : public ::testing::Test { + public: + virtual void SetUp() override { + paddle::platform::CPUPlace cpu_place; + for (size_t i = 0; i < gpu_list.size(); ++i) { + p::CUDAPlace place(i); + dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + } + + NCCLInitOp(); + } + + virtual void TearDown() override { + for (auto &device_context : dev_ctxs) { + delete device_context; + } + } + + void NCCLInitOp() { + paddle::platform::CPUPlace cpu_place; + std::unique_ptr op1(new f::OpDesc); + + op1->SetType("ncclInit"); + op1->SetOutput("Communicator", {"comm"}); + op1->SetAttr("gpus", {gpu_list}); + + auto *var = g_scope.Var("comm"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op1); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; + } + + template + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { + std::unique_lock lk(mu); + const f::OpDesc *op1 = &op_desc; + + p::CUDAPlace place(gpu_id); + auto &ctx = dev_ctxs.at(gpu_id); + + auto *send_tensor = scope->Var("st")->GetMutable(); + auto *recv_tensor = scope->Var("rt")->GetMutable(); + + if (!send_tensor->numel()) { + send_tensor->Resize(kDims); + send_tensor->mutable_data(kDims, place); + + std::vector send_vector(f::product(kDims), gpu_id); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); + ctx->Wait(); + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); + } + + lk.unlock(); + + PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims), + "Tensor numel not match!"); + + auto op = f::OpRegistry::CreateOp(*op1); + + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(1) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); + op->Run(*scope, place); + VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); + } + + public: + std::vector dev_ctxs; + f::Scope g_scope; + std::mutex mu; +}; + +// ncclInitOp with desc +TEST(NCCL, ncclInitOp) { + std::unique_ptr op_desc(new f::OpDesc); + + op_desc->SetType("ncclInit"); + op_desc->SetOutput("Communicator", {"x1"}); + op_desc->SetAttr("gpus", {gpu_list}); + + f::Scope 
g_scope; + paddle::platform::CPUPlace cpu_place; + + auto *var = g_scope.Var("x1"); + var->GetMutable(); + + auto op = f::OpRegistry::CreateOp(*op_desc); + VLOG(1) << "invoke NCCLInitOp."; + op->Run(g_scope, cpu_place); + VLOG(1) << "NCCLInitOp finished."; +} + +// ncclAllReduceOp with desc +TEST_F(NCCLTester, ncclAllReduceOp) { + std::unique_ptr op2(new f::OpDesc); + op2->SetType("ncclAllReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + for (size_t i = 0; i < dev_scopes.size(); ++i) { + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[i]); + + auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[i])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } + } +} + +// ncclReduceOp with desc +TEST_F(NCCLTester, ncclReduceOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclReduce"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + // check results on + float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[kRoot]); + + auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = + dev_scopes[kRoot]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[kRoot])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +// ncclBcastOp with desc +TEST_F(NCCLTester, ncclBcastOp) { + std::unique_ptr op2(new f::OpDesc); + const int kRoot = 0; + op2->SetType("ncclBcast"); + op2->SetInput("X", {"st"}); + op2->SetInput("Communicator", {"comm"}); + op2->SetOutput("Out", {"rt"}); + op2->SetAttr("root", kRoot); + + std::vector dev_scopes; + + std::vector ths; + + for (size_t i = 0; i < gpu_list.size(); ++i) { + dev_scopes.emplace_back(&g_scope.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + *op2.get(), dev_scopes[i]); + ths.emplace_back(std::move(th)); + } + + for (size_t i = 0; i < gpu_list.size(); ++i) { + ths[i].join(); + } + + const int 
idx = 1; + // check results on + float result = kRoot; + + p::CPUPlace cpu_place; + p::CUDAPlace gpu_place(gpu_list[idx]); + + auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); + auto *rt = recv_tensor.data(); + auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable(); + result_tensor->Resize(kDims); + auto *ct = result_tensor->mutable_data(cpu_place); + + paddle::memory::Copy( + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, + recv_tensor.numel() * sizeof(float), + static_cast(dev_ctxs[idx])->stream()); + + for (int64_t j = 0; j < f::product(kDims); ++j) { + ASSERT_NEAR(ct[j], result, 1e-5); + } +} + +int main(int argc, char **argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + const int dev_count = p::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + gpu_list.emplace_back(i); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + + // device context should be release before scope. + // otherwise driver will down. + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0841313a1042ea4099473fdc0293d9bae4a7c8c3 --- /dev/null +++ b/paddle/fluid/operators/nce_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/nce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NCEOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Label")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasOutput("Cost")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); + + auto x_dims = ctx->GetInputDim("Input"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], + ctx->GetInputDim("Bias")[0]); + } + auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); + auto num_total_classes = ctx->Attrs().Get("num_total_classes"); + std::vector custom_neg_classes = + ctx->Attrs().Get>("custom_neg_classes"); + PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); + if (custom_neg_classes.size() > 0) { + PADDLE_ENFORCE_EQ(custom_neg_classes.size(), + static_cast(num_neg_samples)); + } + // set dims of output(Out) + std::vector out_dims; + out_dims.push_back(x_dims[0]); + out_dims.push_back(1); + ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); + + // set dims of output(SampleOut) + std::vector sample_out_dims; + sample_out_dims.push_back(x_dims[0]); + sample_out_dims.push_back(num_neg_samples + num_true_classes); + ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +class NCEOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); + AddInput( + "Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target classes in each sample. " + "The number of target classes per sample should be the same. " + "If you have a variable number of target classes, " + "you can pad them out to a constant number by either repeating them" + " or by padding with an otherwise unused class."); + AddInput("Weight", + "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the " + "total number of classes."); + AddInput( + "Bias", + "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total " + "number of classes. It is a dispensable input.") + .AsDispensable(); + AddInput("SampleWeight", + "(Tensor) A tensor of shape [batch_size, 1] storing a weight for " + "each sample. It is a dispensable input; the default weight of " + "each sample is 1.") + .AsDispensable(); + AddOutput("Cost", + "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); + AddOutput("SampleLogits", + "An intermediate tensor of shape [batch_size, num_neg_samples + " + "num_pos_samples]. " + "This tensor is the output of the forward kernel and is used in the " + "backward kernel to compute grads. " + "Given X is the dot product of the input tensor and the sampled " + "labels' weights, 'SampleLogits' is sigmoid(X).") + .AsIntermediate(); + AddOutput("SampleLabels", + "An intermediate tensor of shape [batch_size, num_neg_samples + " + "num_pos_samples]. " + "This tensor is the output of the forward kernel and is used in the " + "backward kernel to compute grads.") + .AsIntermediate(); + AddAttr("num_total_classes", + "Total number of classes in all samples."); + AddAttr("num_neg_samples", + "The number of negative classes. The default value is 10.") + .SetDefault(10); + AddAttr>("custom_neg_classes", + "This attribute is only used in unit tests. Classes " + "in this list will be used as negative classes " + "for every sample. 
Under normal conditions, " + "user should avoid setting this attribute.") + .SetDefault({}); + AddComment(R"DOC( +Compute and return the noise-contrastive estimation training loss. +See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +By default this operator uses a uniform distribution for sampling. +)DOC"); + } +}; + +class NCEOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasInput("Cost")); + PADDLE_ENFORCE(ctx->HasInput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasInput("SampleLabels")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")), + "The input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto w_dims = ctx->GetInputDim("Weight"); + auto w_grad_name = framework::GradVarName("Weight"); + if (ctx->HasOutput(w_grad_name)) { + ctx->SetOutputDim(w_grad_name, w_dims); + } + + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + auto bias_dims = ctx->GetInputDim("Bias"); + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); +REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, + ops::NCEKernel); +REGISTER_OP_CPU_KERNEL(nce_grad, + ops::NCEGradKernel, + ops::NCEGradKernel); diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..624c2d9bbd3245a11c8cfff2dd7cae6e5b25f106 --- /dev/null +++ b/paddle/fluid/operators/nce_op.h @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "unsupported/Eigen/CXX11/Tensor" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +void PrepareSamples(const framework::ExecutionContext& context) { + auto label = context.Input("Label"); + const int64_t* label_data = label->data(); + auto label_dims = label->dims(); + int num_total_classes = context.Attr("num_total_classes"); + // for unitest + std::vector custom_neg_classes = + context.Attr>("custom_neg_classes"); + // random machine + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution rand(0, num_total_classes - 1); + + auto sample_labels = context.Output("SampleLabels"); + auto sample_labels_dims = sample_labels->dims(); + int64_t* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); + + int num_label = label_dims.size() == 2 ? label_dims[1] : 1; + int index = 0; + for (int64_t i = 0; i < label_dims[0]; ++i) { + int j = 0; + for (; j < num_label; ++j) { + sample_labels_data[index++] = label_data[i * num_label + j]; + } + if (custom_neg_classes.size() > 0) { + for (auto label : custom_neg_classes) { + sample_labels_data[index++] = label; + } + } else { + for (; j < sample_labels_dims[1]; ++j) { + // TODO(wanghaoshuang): support more distribution sampling + sample_labels_data[index++] = rand(rng); + } + } + } +} + +template +class NCEKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PrepareSamples(context); + auto sample_labels = context.Output("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_out = context.Output("SampleLogits"); + T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + auto label = context.Input("Label"); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + auto out = context.Output("Cost"); + T* out_data = out->mutable_data(context.GetPlace()); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int64_t num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + // forward bias + auto bias = context.Input("Bias"); + if (bias != nullptr) { + const T* bias_data = bias->data(); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = bias_data[sample_labels_data[i]]; + } + } else { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = 0; + } + } + // forward mul + auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + // forward cost + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; + out_data[i] = 0; + T w = sample_weight == nullptr ? 1. 
: sample_weight_data[i]; + // for true classes + for (; j < num_true_class; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(o / (o + b)); + out_data[i] += w * cost; + } + // for sampled neg classes + for (; j < sample_labels->dims()[1]; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(b / (o + b)); + out_data[i] += w * cost; + } + } + } +}; + +template +class NCEGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto d_out = context.Input(framework::GradVarName("Cost")); + const T* d_out_data = d_out->data(); + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); + const T* sample_out_data = sample_out->data(); + auto sample_labels = context.Input("SampleLabels"); + const int64_t* sample_labels_data = sample_labels->data(); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_total_classes * num_neg_samples; + Tensor sample_grad; // tmp tensor + T* sample_grad_data = + sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); + // backward cost + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + T o = sample_out_data[i]; + T w = sample_weight == nullptr + ? 1 + : sample_weight_data[i / sample_labels->dims()[1]]; + sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class + ? w * (b / (o + b)) * (o - 1) + : w * (o * (1 - o) / (o + b)); + sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; + } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + // get d_x + auto d_x = context.Output(framework::GradVarName("Input")); + if (d_x != nullptr) { + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); + auto d_x_matrix = EigenMatrix::From(*d_x); + auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0ca5873adcc92a39f20a162796e7581ea10c63f --- /dev/null +++ 
b/paddle/fluid/operators/net_op.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/net_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +const char NetOp::kAll[] = "all"; + +void NetOp::CompleteAddOp(bool calc) { + add_op_done_ = true; + if (!calc) return; + std::set input_set; + std::set output_set; + for (auto& op : ops_) { + for (auto& ipt : op->Inputs()) { + for (auto& var_name : ipt.second) { + // If input variable has been in output set, then it will be + // added into intermediate_outputs_. Otherwise, it will be + // added into input set. + if (Contains(output_set, var_name)) { + intermediate_outputs_.insert(var_name); + } else { + input_set.insert(var_name); + } + } + } + + for (auto& opt : op->Outputs()) { + for (auto& var_name : opt.second) { + output_set.insert(var_name); + } + } + } + auto& inputs = inputs_[kAll]; + inputs.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs)); + auto& outputs = outputs_[kAll]; + outputs.reserve(output_set.size()); + std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); +} + +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { + std::ostringstream os; + os << OperatorBase::DebugStringEx(scope) << std::endl; + for (auto& op : ops_) { + std::istringstream is(op->DebugStringEx(scope)); + for (std::string line; std::getline(is, line);) { + os << " " << line << std::endl; + } + } + return os.str(); +} + +bool NetOp::IsNetOp() const { return true; } + +std::vector NetOp::OutputVars(bool has_intermediate) const { + std::vector all; + for (auto& pair : this->outputs_) { + for (auto& var_name : pair.second) { + all.push_back(var_name); + } + } + if (has_intermediate) { + return all; + } + std::vector ret_val; + for (auto& each : all) { + if (!Contains(intermediate_outputs_, each)) { + ret_val.push_back(each); + } + } + return ret_val; +} + +NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + +std::unique_ptr NetOp::Clone() const { + PADDLE_ENFORCE( + add_op_done_, + "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone"); + return std::unique_ptr(new NetOp(*this)); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h new file mode 100644 index 0000000000000000000000000000000000000000..14e5909851c4ac08b5f59c5c193c801827b91234 --- /dev/null +++ b/paddle/fluid/operators/net_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +/** + * @brief Network is also a type of Operator + * + * It will manage the operators it has. + * + * Network is the container and controller of a set of operators. + * + * A network object knows all Operators belonging to this network. Variables, + * which are inputs and outputs of these operators, are created and managed by a + * hierarchy of Scope objects. + * + * This is the base class of network; all networks should implement the APIs + * it defines. + */ +class NetOp : public framework::OperatorBase { + public: + static const char kAll[]; + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} + + NetOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); + + NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { + this->ops_.reserve(o.ops_.size()); + std::transform( + o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_), + [](const std::unique_ptr& op) { + return std::unique_ptr(op->Clone()); + }); + this->CompleteAddOp(); + } + + /** + * @brief Run the network. + * + * Run all the operators with the `scope`; if no scope is provided, the + * default scope will be used instead. If no OpContext is provided, the + * default context will be used. 
+ */ + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + for (auto& op : ops_) { + op->Run(scope, place); + } + } + + bool SupportGPU() const override { + for (auto& op : ops_) { + if (!op->SupportGPU()) { + return false; + } + } + return true; + } + + void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); } + + /** + * @brief Add an operator by ptr + */ + void AppendOp(std::unique_ptr op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot AppendOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + ops_.push_back(std::move(op)); + } + + void InsertOp(size_t pos, std::unique_ptr op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot InsertOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range"); + ops_.insert(ops_.begin() + pos, std::move(op)); + } + + void InsertOp(size_t pos, const framework::OperatorBase& op) { + InsertOp(pos, op.Clone()); + } + + void CompleteAddOp(bool calculate = true); + + std::string DebugStringEx( + const framework::Scope* scope = nullptr) const override; + + bool IsNetOp() const override; + std::vector OutputVars(bool has_intermediate) const override; + + std::unique_ptr Clone() const override; + + std::vector> ops_; + + private: + bool add_op_done_{false}; + std::set intermediate_outputs_; + + template + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc20be0c81763abe2adcf09de858ce51e16d77a6 --- /dev/null +++ b/paddle/fluid/operators/net_op_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
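// A quick sketch of the bookkeeping these tests exercise (see TEST(OpKernel, all)
// below); the op and variable names here are only the ones used in that test.
// Assuming a net built from two ops,
//
//   NetOp net;
//   // op1: inputs {x, w1, b1} -> output {y}
//   // op2: inputs {y, w2, b2} -> output {z}
//   net.CompleteAddOp();
//
// CompleteAddOp() collects the net-level inputs {x, w1, b1, w2, b2} and outputs
// {y, z}. "y" is also recorded as an intermediate output, because it is produced
// by op1 and then consumed by op2, so net.OutputVars(/*has_intermediate=*/false)
// returns only {"z"}.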
+#include "paddle/fluid/operators/net_op.h" + +#include + +namespace paddle { +namespace operators { +using Scope = framework::Scope; +using DeviceContext = platform::DeviceContext; + +static int run_cnt = 0; + +class TestOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + DEFINE_OP_CLONE_METHOD(TestOp); + void Run(const Scope& scope, const platform::Place& place) const override { + ++run_cnt; + } +}; + +template +void AssertSameVectorWithoutOrder(const std::vector& expected, + const std::vector& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, framework::AttributeMap{}))); + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"Out", {"z"}}}, framework::AttributeMap{}))); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, + net->Inputs(NetOp::kAll)); + AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll)); + + auto final_outs = net->OutputVars(false); + + ASSERT_EQ(final_outs.size(), 1UL); + ASSERT_EQ(final_outs[0], "z"); +} + +TEST(NetOp, insert_op) { + NetOp net; + auto op1 = std::unique_ptr( + new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, framework::AttributeMap{})); + net.AppendOp(*op1); + net.InsertOp(0, *op1); + ASSERT_EQ(2UL, net.ops_.size()); + net.InsertOp(2, std::move(op1)); + ASSERT_EQ(3UL, net.ops_.size()); +} + +TEST(NetOp, Clone) { + NetOp net; + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.CompleteAddOp(true); + auto new_net_op = net.Clone(); + ASSERT_NE(new_net_op, nullptr); + ASSERT_TRUE(new_net_op->IsNetOp()); + auto* new_net = static_cast(new_net_op.get()); + ASSERT_EQ(2UL, new_net->ops_.size()); + ASSERT_EQ(new_net->ops_[0]->Type(), "empty"); + ASSERT_EQ(new_net->ops_[1]->Type(), "empty2"); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee85b1a90a85f8c6ec57900c4c7d0dd319a0186a --- /dev/null +++ b/paddle/fluid/operators/norm_op.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/norm_op.h" +namespace paddle { +namespace operators { + +template +class NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Scale", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is C * 1."); + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddOutput("Out", + "(Tensor) The output tensor of norm operator." + "N * M." + "M = C * H * W"); + AddComment(R"DOC( + "Input shape: $(N, C, H, W)$ + Scale shape: $(C, 1)$ + Output shape: $(N, C, H, W)$ + Where + forward + $$ + [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] + $$ + backward + $$ + \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} + $$ + )DOC"); + } +}; + +class NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of NormOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", in_x_dims); + } +}; + +class NormOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, + ops::NormOpGrad); +REGISTER_OP_CPU_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..438bb3b86e79c526f70a23d3c7f6cc13f72e0463 --- /dev/null +++ b/paddle/fluid/operators/norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..db74c9b02a74afc6bd0e59da97e64a3e556b97dc --- /dev/null +++ b/paddle/fluid/operators/norm_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + auto* out = context.Output("Out"); + auto epsilon = static_cast(context.Attr("epsilon")); + out->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + auto x = + framework::EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = + framework::EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + auto scale_eigen = + framework::EigenVector::Flatten( + *scale); + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = + framework::EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = + framework::EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor out_batch = out->Slice(n, n + 1); + auto out_batch_eigen = + framework::EigenMatrix::From( + out_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp = framework::EigenVector::Flatten(tmp_tensor); + // get colsum and sqrt , inverse + auto dim = Eigen::array({{0}}); + tmp.device(*place) = x_square_batch_eigen.sum(dim); + tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + out_batch_eigen.device(*place) = + in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = 
fea_len; + broadcast_dim_row[0] = 1; + out_batch_eigen.device(*place) = + out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +template +class NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + auto epsilon = static_cast(context.Attr("epsilon")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + in_x_grad->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + + auto scale_eigen = + framework::EigenVector::Flatten( + *scale); + auto x = + framework::EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = + framework::EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = + framework::EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1); + auto in_g_batch_eigen = + framework::EigenMatrix::From( + in_g_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = + framework::EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor outg_batch = out_grad->Slice(n, n + 1); + auto outg_batch_eigen = + framework::EigenMatrix::From( + outg_batch, framework::make_ddim({channels, fea_len})); + + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp_eigen = + framework::EigenVector::Flatten(tmp_tensor); + auto dim = Eigen::array({{0}}); + tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim); + framework::Tensor norm_tmp_tensor; + norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto norm_tmp_eigen = + framework::EigenVector::Flatten(norm_tmp_tensor); + norm_tmp_eigen.device(*place) = + (x_square_batch_eigen.sum(dim) + epsilon).sqrt(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + in_g_batch_eigen.device(*place) = + in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / + (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen; + // outg_batch_eigen + (in_g_batch_eigen * -1); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = fea_len; + broadcast_dim_row[0] = 1; + in_g_batch_eigen.device(*place) = + in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_op.cc 
b/paddle/fluid/operators/one_hot_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c3a60da729d6f4edb8e7e3aa0c81cd3140855c0 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/one_hot_op.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace operators { + +class OneHotOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of OneHotOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of OneHotOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) should be at least 2."); + PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, + "Last dimension of Input(X) should be 1."); + + int depth = ctx->Attrs().Get("depth"); + + PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth); + + framework::DDim out_dims(x_dims); + out_dims[out_dims.size() - 1] = depth; + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /* --> */ "Out"); + } +}; + +class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { + public: + OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, LoDTensor) Input variable with rank at least 2. " + "The last dimension of X should be 1. Each value of X is an index " + "to indicate the position."); + AddOutput("Out", + "(Tensor, Tensor) Output tensor with same rank as X. " + "The tensor consists of one-hot representations of values in X."); + AddAttr("depth", + "A positive integer to specify the length of one-hot vector."); + AddAttr("dtype", + "An integer to specify the data type of one-hot " + "vector. The default value is FP32.") + .SetDefault(paddle::framework::proto::DataType::FP32); + AddComment(R"DOC( +One Hot Operator. This operator creates the one-hot representations for input +index values. 
The following example will help to explain the function of this +operator: + +X is a LoDTensor: + X.lod = [[0, 1, 4]] + X.shape = [4, 1] + X.data = [[1], [1], [3], [0]] + +set depth = 4 + +Out is a LoDTensor: + Out.lod = [[0, 1, 4]] + Out.shape = [4, 4] + Out.data = [[0., 1., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + one_hot, ops::OneHotKernel, + ops::OneHotKernel); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a8061edaab61661f57b17cd5e065c5c84edb906 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/one_hot_op.h" +#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, + const int64_t numel, const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotOpCUDAFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + auto stream = ctx_.stream(); + math::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + p_in_data, p_out_data, numel, depth_); + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpCUDAFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + one_hot, ops::OneHotCUDAKernel, + ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..ddac6edd0ec73a932dcf9c6ca7d7d63853467f1c --- /dev/null +++ b/paddle/fluid/operators/one_hot_op.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct OneHotOpFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + int depth_; + const DeviceContext& ctx_; + + OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + math::set_constant(ctx_, out_, 0.0); + + for (int i = 0; i < numel; ++i) { + PADDLE_ENFORCE_GE(p_in_data[i], 0, + "Illegal index value, should be at least 0."); + PADDLE_ENFORCE_LT(p_in_data[i], depth_, + "Illegal index value, should be less than depth (%d).", + depth_); + *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; + } + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/fluid/operators/op_documentation/batch_norm_op.md similarity index 100% rename from paddle/operators/op_documentation/batch_norm_op.md rename to paddle/fluid/operators/op_documentation/batch_norm_op.md diff --git a/paddle/operators/op_documentation/name_convention.md b/paddle/fluid/operators/op_documentation/name_convention.md similarity index 100% rename from paddle/operators/op_documentation/name_convention.md rename to paddle/fluid/operators/op_documentation/name_convention.md diff --git a/paddle/operators/op_documentation/net_op_design.md b/paddle/fluid/operators/op_documentation/net_op_design.md similarity index 100% rename from paddle/operators/op_documentation/net_op_design.md rename to paddle/fluid/operators/op_documentation/net_op_design.md diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/fluid/operators/op_documentation/op_markdown_format.md similarity index 100% rename from paddle/operators/op_documentation/op_markdown_format.md rename to paddle/fluid/operators/op_documentation/op_markdown_format.md diff --git a/paddle/operators/op_documentation/rnn_design.md b/paddle/fluid/operators/op_documentation/rnn_design.md similarity index 100% rename from 
paddle/operators/op_documentation/rnn_design.md rename to paddle/fluid/operators/op_documentation/rnn_design.md diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b021fde7cba699327e1874d569a14c3139a4c32 --- /dev/null +++ b/paddle/fluid/operators/pad_op.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pad_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PadOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), + "Size of paddings should be equal to 2 * dimension size " + "of input tensor."); + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class PadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PadOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input of pad op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad op. " + "A tensor with the same shape as X."); + AddAttr>( + "paddings", + "(vector) " + "A list to describe the padding rules for each dimension. " + "For 2-D image tensor, paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. Size of paddings should be equal to " + "2 * dimension size of the input tensor."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas.") + .SetDefault(0.0f); + AddComment(R"DOC( +Pad Operator. + +Pad input into output, as specified by paddings and pad_value. +The input should be a k-D tensor(k > 0 and k < 7). 
As an example: + +Given: + +X = [[1, 2], + [3, 4]], + +paddings = [0, 1, 1, 2], + +and + +pad_value = 0, + +we have: + +Out = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]] + +)DOC"); + } +}; + +class PadOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class PadOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("X", Input("X")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); + bind->SetAttrMap(Attrs()); + bind->SetType("pad_grad"); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); +REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..203c31440371440b5e942452dab08978e2136275 --- /dev/null +++ b/paddle/fluid/operators/pad_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..244d8f9b6cf51ab249e991ed129e99da67ff9e62 --- /dev/null +++ b/paddle/fluid/operators/pad_op.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + T pad_value = context.Attr("pad_value"); + + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.pad(paddings, pad_value); +} + +template +class PadKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + PadFunction(context); + break; + case 2: + PadFunction(context); + break; + case 3: + PadFunction(context); + break; + case 4: + PadFunction(context); + break; + case 5: + PadFunction(context); + break; + case 6: + PadFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +template +void PadGradFunction(const framework::ExecutionContext& context) { + auto pads = context.Attr>("paddings"); + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + auto& place = + *context.template device_context().eigen_device(); + d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); + } +} + +template +class PadGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + PadGradFunction(context); + break; + case 2: + PadGradFunction(context); + break; + case 3: + PadGradFunction(context); + break; + case 4: + PadGradFunction(context); + break; + case 5: + PadGradFunction(context); + break; + case 6: + PadGradFunction(context); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e25df92479943d210d98f02374f377f778f43d2c --- /dev/null +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -0,0 +1,378 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +static void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, std::vector *sub_scopes, + const std::vector &places, + const std::vector &names) { + size_t num_sub_scopes = 0; + for (auto &argu : names) { + const auto &tensor = + detail::Ref(scope.FindVar(argu), + "Cannot find variable %s in the parent scope", argu) + .Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + if (num_sub_scopes == 0) { + num_sub_scopes = lod_tensors.size(); + } else { + PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); + } + PADDLE_ENFORCE_NE(num_sub_scopes, 0); + if (sub_scopes->size() == 0) { + sub_scopes->reserve(num_sub_scopes); + for (size_t i = 0; i < num_sub_scopes; ++i) { + sub_scopes->emplace_back(&scope.NewScope()); + } + } + + for (size_t i = 0; i < lod_tensors.size(); ++i) { + *detail::Ref(sub_scopes->at(i)->Var(argu), + "Cannot find variable in the sub-scope", argu) + .GetMutable() = lod_tensors[i]; + } + } +} + +inline void CopyOrShare(const framework::Variable &src, + const platform::Place &dst_place, + framework::Variable *dst) { + if (src.IsType()) { + if (src.Get().place() == dst_place) { + dst->GetMutable()->ShareDataWith(src.Get()); + dst->GetMutable()->set_lod(src.Get().lod()); + } else { + Copy(src.Get(), dst_place, dst->GetMutable()); + } + } else if (src.IsType()) { + auto &src_sr = src.Get(); + auto *dst_sr = dst->GetMutable(); + dst_sr->set_height(src_sr.height()); + if (src_sr.value().place() == dst_place) { + dst_sr->mutable_value()->ShareDataWith(src_sr.value()); + dst_sr->set_rows(src_sr.rows()); + } else { + Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); + } + } else { + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + } +} + +void WaitOnPlace(const platform::Place place) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); +} + +void WaitOnPlaces(const std::vector places) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + + for (auto &place : places) { + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, 
attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + + // split input + SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, + Inputs(kInputs)); + + // copy parameter + for (auto ¶m : Inputs(kParameters)) { + PADDLE_ENFORCE(scope.FindVar(param)->IsType(), + "Only support parameter type as LoDTensor"); + auto &src = scope.FindVar(param)->Get(); + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = places[i]; + auto *sub_scope = sub_scopes[i]; + auto *dst = sub_scope->Var(param)->GetMutable(); + framework::Copy(src, place, dst); + } + } + WaitOnPlaces(places); + + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + WaitOnPlaces(places); + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, "").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddComment(R"DOC( +ParallelDo Operator. 
+)DOC"); + } +}; + +class ParallelDoGradOp : public framework::OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + // feed output@grad + SplitTensorAndMoveTensorToScopes( + scope, const_cast *>(&sub_scopes), + places, Inputs(framework::GradVarName(kOutputs))); + WaitOnPlaces(places); + + // exe run + std::vector> workers; + for (size_t i = 0; i < sub_scopes.size(); ++i) { + auto &place = places[i]; + auto *cur_scope = sub_scopes[i]; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + AccumulateGrad(scope, place, sub_scopes, places); + } + + void AccumulateGrad(const framework::Scope &scope, + const platform::Place &place, + const std::vector &sub_scopes, + const platform::PlaceList &places) const { + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << "Accumulating " << s; + if (s == framework::kEmptyVarName) continue; + std::string tmp_name; + auto *tmp = sub_scopes[0]->Var(&tmp_name); + + for (size_t i = 1; i < sub_scopes.size(); ++i) { + CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); + WaitOnPlaces(places); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); + sum_op->Run(*sub_scopes[0], places[0]); + WaitOnPlace(places[0]); + } + + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); + } + WaitOnPlaces(places); + } +}; + +std::ostream &operator<<(std::ostream &sout, + const std::vector &strs) { + std::copy(strs.begin(), strs.end(), + std::ostream_iterator(sout, ",")); + return sout; +} + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + if (input_param != kPlaces) { + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + } + auto *g_block = this->grad_block_[0]; + + // All variable name that needed by gradient operators + std::unordered_set all_inputs_in_grad_blocks; + + for (size_t i = 0; i < g_block->OpSize(); ++i) { + auto *op = g_block->Op(i); + for (auto &var_name : op->InputArgumentNames()) { + all_inputs_in_grad_blocks.insert(var_name); + } + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + 
std::vector og_names; + for (auto &og_name : this->OutputGrad(output_param)) { + if (all_inputs_in_grad_blocks.count(og_name) != 0) { + // there are some gradient operators who need the OG. So make this + // OG as an input of parallel.do + og_names.push_back(og_name); + } + // else, there is no operator who need the OG. Do not use this OG as + // an input + } + grad->SetInput(framework::GradVarName(output_param), og_names); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs(kParameters)); + PADDLE_ENFORCE(ctx->HasInputs(kInputs)); + PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); + + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + + auto i_dims = ctx->GetInputsDim(kInputs); + auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); + + for (size_t i = 0; i < ig_names.size(); ++i) { + auto &ig_name = ig_names[i]; + if (ig_name == framework::kEmptyVarName) { + continue; + } + + ctx->SetDims({ig_name}, {i_dims[i]}); + } + + auto p_dims = ctx->GetInputsDim(kParameters); + auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); + for (size_t i = 0; i < pg_names.size(); ++i) { + auto &pg_name = pg_names[i]; + if (pg_name == framework::kEmptyVarName) { + continue; + } + ctx->SetDims({pg_name}, {p_dims[i]}); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..75984b7721c48526c6d11c4b82004dba1c166cc4 --- /dev/null +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
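AccumulateGrad above reduces each parameter gradient across the per-place sub-scopes by repeatedly running a sum op into sub_scopes[0] and then copying (or sharing) the result into the parent scope. A minimal sketch of that reduction, with scopes, devices, and the sum operator replaced by plain vectors (all names and numbers here are illustrative):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Reduce per-device gradients of one parameter into a single buffer,
    // mirroring the repeated "sum" op that writes into sub_scopes[0].
    std::vector<float> AccumulateGradSketch(
        const std::vector<std::vector<float>>& per_device_grads) {
      assert(!per_device_grads.empty());
      std::vector<float> acc = per_device_grads[0];  // plays the role of sub_scopes[0]
      for (size_t dev = 1; dev < per_device_grads.size(); ++dev) {
        for (size_t i = 0; i < acc.size(); ++i) {
          acc[i] += per_device_grads[dev][i];  // one "sum" op step
        }
      }
      return acc;  // CopyOrShare then publishes this to the parent scope
    }

    int main() {
      // Two places, each holding a gradient for the same 3-element parameter.
      std::vector<std::vector<float>> grads = {{0.1f, 0.2f, 0.3f},
                                               {0.4f, 0.5f, 0.6f}};
      for (float v : AccumulateGradSketch(grads)) std::printf("%g ", v);
      std::printf("\n");  // prints 0.5 0.7 0.9
      return 0;
    }

The shapes match across places because the forward pass copies the same parameter tensor into every sub-scope before the workers run.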
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using DataLayout = platform::DataLayout; +using PoolingMode = platform::PoolingMode; + +template +class PoolCUDNNOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + Tensor *output = ctx.Output("Out"); + + const T *input_data = input->data(); + T *output_data = output->mutable_data(ctx.GetPlace()); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + cudnn_output_desc, output_data)); + } +}; + +template +class PoolCUDNNGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + + const Tensor *input = ctx.Input("X"); + const Tensor *output = ctx.Input("Out"); + const Tensor *output_grad = + ctx.Input(framework::GradVarName("Out")); + Tensor *input_grad = ctx.Output(framework::GradVarName("X")); + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + const T *input_data = input->data(); + const T *output_data = output->data(); + const T *output_grad_data = output_grad->data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + DataLayout layout; + + if (strides.size() == 2U) { + 
layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = PoolingMode::kAverage; + } + + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, ksize, paddings, strides); + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cuda_device_context().cudnn_handle(); + T alpha = 1.0f, beta = 0.0f; + + if (input_grad) { + T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + + PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, + cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, + &beta, cudnn_input_desc, input_grad_data)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); + +REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNGradOpKernel, + ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9dd33eefc5fd18ba08dce5ea8dff791cda54332c --- /dev/null +++ b/paddle/fluid/operators/pool_op.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_op.h" + +namespace paddle { +namespace operators { + +int OutputSizePool(int input_size, int filter_size, int padding, int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +void PoolOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::string pooling_type = ctx->Attrs().Get("pooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", "Out"); +} + +framework::OpKernelType PoolOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); +} + +framework::OpKernelType PoolOpGrad::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto &dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = 
framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); +} + +Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the feature, " + "and W is the width of the feature."); + + AddAttr("pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>("ksize", + "(vector) The pooling window " + "size(height, width) of the pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr("global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); + // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0}), paddings(height, width) of pooling " + "operator." + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool2d Operator. + +The pooling2d operation calculates the output based on +the input, pooling_type and ksize, strides, paddings parameters. +Input(X) and output(Out) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); +} + +Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of " + "the feature, respectively."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator." 
+ "The format of output tensor is also NCDHW, " + "where N is batch size, C is " + "the number of channels, and D, H and W is the depth, height and " + "width of the feature, respectively."); + + AddAttr("pooling_type", + "(string) Pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr>( + "ksize", + "(vector) The pooling window size(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will " + "be ignored."); // TODO(Chengduo): Add checker. + // (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings wille be ignored.") + .SetDefault(false); + AddAttr>( + "strides", + "(vector, default {1,1,1}) Strides(depth, height, " + "width) of the pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + + AddComment(R"DOC( +Pool3d Operator. + +The pooling3d operation calculates the output based on +the input, pooling_type, ksize, strides, and paddings parameters. +Input(X) and output(Out) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. Parameters(ksize, strides, paddings) +are three elements. These three elements represent depth, height and +width, respectively. The input(X) size and output(Out) size may be different. 
+ +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); +} +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) + +REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, + ops::PoolOpGrad); + +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..14486c07402af387ee11a127ba193ac9ac36c8a2 --- /dev/null +++ b/paddle/fluid/operators/pool_op.cu.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4cabd634d66e402282f17ed8724129a3e6e1ff43 --- /dev/null +++ b/paddle/fluid/operators/pool_op.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
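The H_out/W_out/D_out formulas above and OutputSizePool in pool_op.cc are the same integer arithmetic; a quick standalone check with sample numbers (the helper name below is just for this sketch):

    #include <cstdio>

    // Same rule as OutputSizePool: integer division, matching InferShape.
    int OutputSizePoolSketch(int input_size, int filter_size, int padding,
                             int stride) {
      return (input_size - filter_size + 2 * padding) / stride + 1;
    }

    int main() {
      // A 7 x 7 feature map, 2 x 2 window, no padding, stride 2 -> 3 x 3 output.
      std::printf("%d\n", OutputSizePoolSketch(7, 2, 0, 2));  // prints 3
      // global_pooling = true resets padding to 0 and ksize to the input size,
      // so each spatial dimension collapses to 1.
      std::printf("%d\n", OutputSizePoolSketch(7, 7, 0, 2));  // prints 1
      return 0;
    }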
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class PoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class PoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + +template +class PoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + + std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::Pool2dFunctor< + DeviceContext, paddle::operators::math::MaxPool, T> + pool2d_forward; + paddle::operators::math::MaxPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool2d_forward; + paddle::operators::math::AvgPool pool_process; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::MaxPool, T> + pool3d_forward; + paddle::operators::math::MaxPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dFunctor< + DeviceContext, paddle::operators::math::AvgPool, T> + pool3d_forward; + paddle::operators::math::AvgPool pool_process; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class PoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + 
std::string pooling_type = context.Attr("pooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + auto& dev_ctx = context.template device_context(); + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + paddle::operators::math::SetConstant set_constant; + set_constant(dev_ctx, in_x_grad, 0.0); + + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool2dGradFunctor + pool2d_backward; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool2dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool2d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + case 3: { + if (pooling_type == "max") { + paddle::operators::math::MaxPool3dGradFunctor + pool3d_backward; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + paddle::operators::math::Pool3dGradFunctor< + DeviceContext, paddle::operators::math::AvgPoolGrad, T> + pool3d_backward; + paddle::operators::math::AvgPoolGrad pool_process; + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef6d5d867b2d38b3ca26deb1cbd9f16ca9846d0f --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
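PoolGradKernel above hands the backward computation to Pool2dGradFunctor/Pool3dGradFunctor (or the dedicated max-pool variants). For the average case, the sketch below assumes the standard definition, in which each output gradient is spread uniformly over its pooling window; it is a 1-D illustration with non-overlapping windows and made-up numbers, not the functor implementation itself:

    #include <cstdio>
    #include <vector>

    int main() {
      const int ksize = 2, stride = 2;               // non-overlapping windows
      std::vector<float> out_grad = {1.0f, 2.0f};    // d(loss)/d(out)
      std::vector<float> in_grad(out_grad.size() * stride, 0.0f);

      for (size_t o = 0; o < out_grad.size(); ++o) {
        for (int k = 0; k < ksize; ++k) {
          // Every input covered by window o receives an equal 1/ksize share.
          in_grad[o * stride + k] += out_grad[o] / ksize;
        }
      }
      for (float v : in_grad) std::printf("%g ", v);  // prints 0.5 0.5 1 1
      std::printf("\n");
      return 0;
    }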
*/ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace paddle { +namespace operators { + +inline int OutputSizeMaxPool(int input_size, int filter_size, int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +class MaxPoolWithIndexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Mask"), + "Output(Mask) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D tensor."); + + if (ctx->Attrs().Get("global_pooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_dims[i + 2]); + } + } + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the image, " + "and W is the width of the image."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. 
" + "The format of output tensor is also NCHW, " + "where N is batch size, C is " + "the number of channels, H is the height of the image " + "and W is the width of the image."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator." + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the image, " + "and W is the width of the image. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(height, " + "width) of pooling operator. " + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default:false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1, 1}), strides(height, " + "width) of pooling operator.") + .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default:{0, 0}), paddings(height, width) of pooling " + "operator. " + "If global_pooling = true, paddings and will be ignored.") + .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool2d Operator. + +The maxPooling2d with index operation calculates the output and the mask +based on the input, ksize, strides, and paddings parameters. Input(X) and +output(Out, Mask) are in NCHW format, where N is batch size, C is the +number of channels, H is the height of the feature, +and W is the width of the feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + Mask shape: $(N, C, H_{out}, W_{out})$ + Where + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + $$ + +)DOC"); + } +}; + +class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of pooling operator. " + "The format of input tensor is NCDHW, where N is batch size, C is " + "the number of channels, and D, H and W are the depth, height and " + "width of " + "the image, respectively"); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, " + "and D, H and W are the depth, height and " + "width of the image, respectively."); + AddOutput("Mask", + "(Tensor) The Mask tensor of pooling operator. " + "The format of output tensor is also NCDHW, " + "where N is the batch size, C is the number of channels, and " + "D, H and W are the depth, height and width " + "of the image, respectively. " + "It represents the index in the current feature map."); + + AddAttr>("ksize", + "(vector) The pooling window size(depth, " + "height, width) of pooling operator. 
" + "If global_pooling = true, ksize and paddings " + "will be ignored."); // TODO(Chengduo): Add + // checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "global_pooling", + "(bool, default false) Whether to use the global pooling. " + "If global_pooling = true, ksize and paddings will be ignored.") + .SetDefault(false); + AddAttr>("strides", + "(vector, default {1,1,1}), strides(depth, " + "height, width) of pooling operator.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "(vector, default {0,0,0}), paddings(depth, " + "height, width) of pooling operator. " + "If global_pooling = true, paddings and ksize will be ignored.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( +MaxPool3d Operator. + +The maxpooling3d with index operation calculates the output and the mask +based on the input and ksize, strides, paddings parameters. +Input(X) and output(Out, Mask) are in NCDHW format, where N is batch +size, C is the number of channels, and D, H and W are the depth, height and +width of the feature, respectively. +Parameters(ksize, strides, paddings) are three elements. +These three elements represent depth, height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: $(N, C, D_{in}, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, + ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, + ops::MaxPoolWithIndexOpGrad); + +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CPU_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..722a4d1e2a4a4ad5c1268483db46e5e9d5d4a33b --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_with_index_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool2d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) + +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index, + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( + max_pool3d_with_index_grad, + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h new file mode 100644 index 0000000000000000000000000000000000000000..da7ef9df73a51aabc208521880168144de6f392c --- /dev/null +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
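The Mask output of max_pool2d_with_index/max_pool3d_with_index is documented above as "the index in the current feature map". The sketch below assumes that means the flattened position (h * W + w) of the selected maximum inside its input feature map; shapes and values are illustrative only:

    #include <cstdio>
    #include <vector>

    int main() {
      const int H = 4, W = 4, k = 2, stride = 2;
      // One 4 x 4 feature map, stored row-major.
      std::vector<float> x = {1, 2, 5, 3,
                              4, 0, 1, 2,
                              7, 8, 3, 1,
                              6, 5, 2, 9};
      const int out_h = (H - k) / stride + 1, out_w = (W - k) / stride + 1;

      for (int oh = 0; oh < out_h; ++oh) {
        for (int ow = 0; ow < out_w; ++ow) {
          int best_idx = (oh * stride) * W + (ow * stride);
          float best = x[best_idx];
          for (int i = 0; i < k; ++i) {
            for (int j = 0; j < k; ++j) {
              int idx = (oh * stride + i) * W + (ow * stride + j);
              if (x[idx] > best) { best = x[idx]; best_idx = idx; }
            }
          }
          // Out holds the maximum, Mask holds where it came from.
          std::printf("out[%d][%d] = %g, mask = %d\n", oh, ow, best, best_idx);
        }
      }
      return 0;
    }

Under that assumption the backward pass only has to scatter each Out@GRAD value back to the position recorded in Mask, which is consistent with MaxPoolWithIndexGradKernel reading Mask and Out@GRAD rather than recomputing the argmax.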
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MaxPoolWithIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + Tensor* out = context.Output("Out"); + Tensor* mask = context.Output("Mask"); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexFunctor + pool2d_forward; + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexFunctor + pool3d_forward; + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } +}; + +template +class MaxPoolWithIndexGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* mask = context.Input("Mask"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + if (context.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x_grad->dims()[i + 2]); + } + } + + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + auto& device_ctx = context.template device_context(); + math::set_constant(device_ctx, in_x_grad, 0); + + switch (ksize.size()) { + case 2: { + paddle::operators::math::MaxPool2dWithIndexGradFunctor + pool2d_backward; + pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + case 3: { + paddle::operators::math::MaxPool3dWithIndexGradFunctor + pool3d_backward; + pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, + paddings, in_x_grad); + } break; + default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d237da25a00de13057e009b6705d3241b8b26539 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/positive_negative_pair_op.h" + +namespace paddle { +namespace operators { + +class PositiveNegativePairOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Score"), + "Input(Score) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("Label"), + "Input(Label) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("QueryID"), + "Input(QueryID) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PositivePair"), + "Output(PositivePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegativePair"), + "Output(NegativePair) of PositiveNegativePairOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NeutralPair"), + "Output(NeutralPair) of PositiveNegativePairOp should not be null."); + auto scalar_dim = framework::make_ddim({1}); + if (ctx->HasInput("AccumulatePositivePair") || + ctx->HasInput("AccumulateNegativePair") || + ctx->HasInput("AccumulateNeutralPair")) { + PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") && + ctx->HasInput("AccumulateNegativePair") && + ctx->HasInput("AccumulateNeutralPair"), + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them is " + "specified."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, + "Shape of AccumulatePositivePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim, + "Shape of AccumulateNegativePair should be {1}."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim, + "Shape of AccumulateNeutralPair should be {1}."); + } + + auto score_dim = ctx->GetInputDim("Score"); + auto label_dim = ctx->GetInputDim("Label"); + auto query_dim = ctx->GetInputDim("QueryID"); + PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + label_dim[0], score_dim[0], + "Tensor Score and Label should have the same height (batch size)."); + PADDLE_ENFORCE_EQ(label_dim[1], 1, + "The width of Label should be 1, i.e. 
each item should " + "have a scalar label."); + PADDLE_ENFORCE(query_dim == label_dim, + "QueryID should have the same shape as Label."); + if (ctx->HasInput("Weight")) { + PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim, + "Weight should have the same shape as Label."); + } + int column = ctx->Attrs().Get("column"); + auto depth = score_dim[1]; + PADDLE_ENFORCE(column < depth && column >= -depth, + "Attribute column should be in the range of [-%l, %l)", + depth, depth); + + ctx->SetOutputDim("PositivePair", scalar_dim); + ctx->SetOutputDim("NegativePair", scalar_dim); + ctx->SetOutputDim("NeutralPair", scalar_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Score")->type()), + ctx.device_context()); + } +}; + +class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Score", + "(Tensor, float) Model Score on an item (with " + "respect to QueryID). It's a 2-D tensor with shape [batch_size, " + "depth], where the column specified by the attribute \"column\" " + "is used as item score."); + AddInput("Label", + "(Tensor, float) Label of an item (with repsect to " + "QueryId). It's a 2-D tensor with shape [batch_size, 1]."); + AddInput("QueryID", + "(Tensor, int64) Query ID that indicates the context. Its shape " + "should be the same as Label."); + AddInput( + "AccumulatePositivePair", + "(float) Optional. The accumulated number of positive pairs over a " + "stream of data. If provided, the output PositivePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput( + "AccumulateNegativePair", + "(float) Optional. The accumulated number of negative pairs over a " + "stream of data. If provided, the output NegativePair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("AccumulateNeutralPair", + "(float) Optional. The accumulated number of neutral pairs over a " + "stream of data. If provided, the output NeutralPair will be " + "initialized with this number rather than 0. it won't be modified " + "in place.") + .AsDispensable(); + AddInput("Weight", + "(float) Optional. Weight of current item. If specified, its " + "shape should be the same as Label, and the meaning of the output " + "changes from numbers of pairs to the total sum of pairs' " + "weights. Weight of a pair of items is the average of their " + "weights.") + .AsDispensable(); + AddOutput("PositivePair", + "(float) Number of positive pairs, i.e. the pairs of " + "items that are ranked correctly."); + AddOutput("NegativePair", + "(float) Number of negative pairs, i.e. the pairs of " + "items that are ranked incorrectly."); + AddOutput("NeutralPair", + "(float) Number of neutral pairs, i.e. the pairs of items " + "that have the same score.") + .AsDispensable(); + AddAttr( + "column", + "(int, default -1) The column position of Score used to rank items in " + "descending order. It must be in the range of [-rank(Score), " + "rank(Score)). " + "If `dim < 0`, the dim to reduce is `rank + dim`. 
" + "Noting that reducing on the first dim will make the LoD info lost.") + .SetDefault(0); + AddComment(R"DOC( +PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's +performance. + +Within some context, e.g. the "query", a LTR model generates scores for a list +of items, which gives a partial order of the items. PositiveNegativePairOp +takes a list of reference rank order (Input("Label")) and the model generated +scores (Input(Score)) as inputs and counts the pairs that ranked correctly +and incorrectly. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair, + ops::PositiveNegativePairOp, + ops::PositiveNegativePairOpMaker); +REGISTER_OP_CPU_KERNEL( + positive_negative_pair, + ops::PositiveNegativePairKernel, + ops::PositiveNegativePairKernel); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f20f33bbeb19766d6974ea17b155cac363c01fb2 --- /dev/null +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/utils/Logging.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class PositiveNegativePairKernel : public framework::OpKernel { + public: + struct PredictionResult { + PredictionResult(T score, T label, T weight) + : score(score), label(label), weight(weight) {} + T score; + T label; + T weight; + }; + + void Compute(const framework::ExecutionContext& context) const override { + auto score_t = context.Input("Score"); + auto label_t = context.Input("Label"); + auto query_t = context.Input("QueryID"); + auto acc_positive_t = context.Input("AccumulatePositivePair"); + auto acc_negative_t = context.Input("AccumulateNegativePair"); + auto acc_neutral_t = context.Input("AccumulateNeutralPair"); + auto positive_t = context.Output("PositivePair"); + auto negative_t = context.Output("NegativePair"); + auto neutral_t = context.Output("NeutralPair"); + auto weight_t = context.Input("Weight"); + + auto score = score_t->data(); + auto label = label_t->data(); + auto query = query_t->data(); + const T* weight = nullptr; + if (weight_t != nullptr) { + weight = weight_t->data(); + } + T* positive = positive_t->mutable_data(context.GetPlace()); + T* negative = negative_t->mutable_data(context.GetPlace()); + T* neutral = neutral_t->mutable_data(context.GetPlace()); + + auto score_dim = score_t->dims(); + auto batch_size = score_dim[0]; + auto width = score_dim[1]; + auto column = context.Attr("column"); + if (column < 0) { + column += width; + } + + // construct document instances for each query: Query => List[, ...] + std::unordered_map> predictions; + for (auto i = 0; i < batch_size; ++i) { + if (predictions.find(query[i]) == predictions.end()) { + predictions.emplace( + std::make_pair(query[i], std::vector())); + } + predictions[query[i]].emplace_back(score[i * width + column], label[i], + weight_t != nullptr ? weight[i] : 1.0); + } + + // for each query, accumulate pair counts + T pos = 0, neg = 0, neu = 0; + if (acc_positive_t != nullptr && acc_negative_t != nullptr && + acc_neutral_t != nullptr) { + pos = acc_positive_t->data()[0]; + neg = acc_negative_t->data()[0]; + neu = acc_neutral_t->data()[0]; + } + auto evaluate_one_list = [&pos, &neg, + &neu](std::vector vec) { + for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) { + for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) { + if (ite1->label == ite2->label) { // labels are equal, ignore. + continue; + } + T w = (ite1->weight + ite2->weight) * 0.5; + if (ite1->score == ite2->score) { + neu += w; + } + (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0 + ? pos += w + : neg += w; + } + } + }; + for (auto prediction : predictions) { + evaluate_one_list(prediction.second); + } + *positive = pos; + *negative = neg; + *neutral = neu; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..30d594719c7274b90a88127028035a49c25e32e7 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.cc @@ -0,0 +1,182 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
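// Reading note (not part of this patch): PositiveNegativePairKernel groups rows by
// QueryID and, inside each query, inspects every pair of differently-labeled items.
// Below is a self-contained sketch of that counting rule with made-up scores and
// labels for a single query (weights fixed at 1; ties simplified to count only as
// neutral).
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> score = {0.9f, 0.3f, 0.5f};  // model scores
  const std::vector<float> label = {2.f, 0.f, 1.f};     // reference ranks
  float pos = 0.f, neg = 0.f, neu = 0.f;
  for (size_t i = 0; i < score.size(); ++i) {
    for (size_t j = i + 1; j < score.size(); ++j) {
      if (label[i] == label[j]) continue;                  // equal labels are ignored
      if (score[i] == score[j]) { neu += 1.f; continue; }  // tied scores are neutral
      if ((score[i] - score[j]) * (label[i] - label[j]) > 0.f)
        pos += 1.f;  // ranked correctly
      else
        neg += 1.f;  // ranked incorrectly
    }
  }
  std::printf("pos=%.0f neg=%.0f neu=%.0f\n", pos, neg, neu);  // pos=3 neg=0 neu=0
  return 0;
}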
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/precision_recall_op.h" + +namespace paddle { +namespace operators { + +class PrecisionRecallOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MaxProbs"), + "Input(MaxProbs) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"), + "Output(BatchMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"), + "Output(AccumMetrics) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"), + "Output(AccumStatesInfo) should not be null."); + + int64_t cls_num = + static_cast(ctx->Attrs().Get("class_number")); + auto max_probs_dims = ctx->GetInputDim("MaxProbs"); + auto labels_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(max_probs_dims[1], 1, + "Each instance contains one max probability, so the " + "shape of Input(MaxProbs) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims, + "The shape of Input(Indices) should be [batch_size, 1]."); + PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0], + "The 1st dimension of Input(MaxProbs) and " + "Input(Labels) both are batch_size and the shape should " + "be the same."); + PADDLE_ENFORCE_EQ(labels_dims[1], 1, + "The 2nd dimension of Input(Labels) contains instance " + "label and the shape should be equal to 1."); + if (ctx->HasInput("Weights")) { + auto weights_dims = ctx->GetInputDim("Weights"); + PADDLE_ENFORCE_EQ(weights_dims, + framework::make_ddim({max_probs_dims[0], 1}), + "The shape of Input(Weights) should be " + "[batch_size, 1]."); + } + if (ctx->HasInput("StatesInfo")) { + auto states_dims = ctx->GetInputDim("StatesInfo"); + PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}), + "The shape of Input(StatesInfo) should be " + "[class_number, 4]."); + } + + // Layouts of BatchMetrics and AccumMetrics both are: + // [ + // macro average precision, macro average recall, macro average F1 score, + // micro average precision, micro average recall, micro average F1 score + // ] + ctx->SetOutputDim("BatchMetrics", {6}); + ctx->SetOutputDim("AccumMetrics", {6}); + // Shape of AccumStatesInfo is [class_number, 4] + // The layout of each row is: + // [ TP, FP, TN, FN ] + ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("MaxProbs")->type()), + ctx.device_context()); + } +}; + +class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("MaxProbs", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. 
Each row contains the max probability " + "of an instance which computed by the previous top_k (k=1) " + "operator."); + AddInput("Indices", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each row contains the corresponding " + "index which computed by the previous top_k (k=1) operator."); + AddInput("Labels", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. Each element is a label and the " + "value should be in [0, class_number - 1]."); + AddInput("Weights", + "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " + "where N is the batch size. This input is optional. If provided, " + "weight of instance would be considered when computing metrics.") + .AsDispensable(); + AddInput("StatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is the number of classes. This input is optional. If " + "provided, current state will be accumulated to this state and " + "the accumulation state will be the output state.") + .AsDispensable(); + AddOutput("BatchMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for current batch data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumMetrics", + "(Tensor, default Tensor) A 1-D tensor with shape {6}. " + "This output tensor contains metrics for accumulated data. " + "The layout is [macro average precision, macro average recall, " + "macro f1 score, micro average precision, micro average recall, " + "micro f1 score]."); + AddOutput("AccumStatesInfo", + "(Tensor, default Tensor) A 2-D tensor with shape D x 4, " + "where D is equal to class number. This output tensor contains " + "accumulated state variables used to compute metrics. The layout " + "for each class is [true positives, false positives, " + "true negatives, false negatives]."); + AddAttr("class_number", "(int) Number of classes to be evaluated."); + AddComment(R"DOC( +Precision Recall Operator. + +When given Input(Indices) and Input(Labels), this operator can be used +to compute various metrics including: +1. macro average precision +2. macro average recall +3. macro f1 score +4. micro average precision +5. micro average recall +6. micro f1 score + +To compute the above metrics, we need to do statistics for true positives, +false positives and false negatives. Here the count of true negatives is not +necessary, but counting it may provide potential usage and the cost is +trivial, so the operator also provides the count of true negatives. + +We define state as a 2-D tensor with shape [class_number, 4]. Each row of a +state contains statistic variables for corresponding class. Layout of each row +is: TP(true positives), FP(false positives), TN(true negatives), +FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be +calculated by given weight instead of the instance count. + +This operator also supports metrics computing for cross-batch situation. To +achieve this, Input(StatesInfo) should be provided. State of current batch +data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo) +is the accumulation state. + +Output(BatchMetrics) is metrics of current batch data while +Output(AccumStatesInfo) is metrics of accumulation data. 
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp, + ops::PrecisionRecallOpMaker); +REGISTER_OP_CPU_KERNEL( + precision_recall, + ops::PrecisionRecallKernel, + ops::PrecisionRecallKernel); diff --git a/paddle/fluid/operators/precision_recall_op.h b/paddle/fluid/operators/precision_recall_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7dae86b76fc8d50e2ed5fe353920b68f7a846fb1 --- /dev/null +++ b/paddle/fluid/operators/precision_recall_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +enum StateVariable { TP = 0, FP, TN, FN }; + +template +class PrecisionRecallKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in0 = ctx.Input("Indices"); + auto* in1 = ctx.Input("Labels"); + auto* in2 = ctx.Input("Weights"); + auto* in3 = ctx.Input("StatesInfo"); + auto* out0 = ctx.Output("BatchMetrics"); + auto* out1 = ctx.Output("AccumMetrics"); + auto* out2 = ctx.Output("AccumStatesInfo"); + + const int* ids_data = in0->data(); + const int* labels_data = in1->data(); + size_t cls_num = static_cast(ctx.Attr("class_number")); + const T* weights_data = in2 ? in2->data() : nullptr; + const T* states_data = in3 ? in3->data() : nullptr; + double* batch_metrics_data = out0->mutable_data(ctx.GetPlace()); + double* accum_metrics_data = out1->mutable_data(ctx.GetPlace()); + out2->mutable_data(ctx.GetPlace()); + auto accum_states = EigenMatrix::From(*out2); + accum_states.setZero(); + T* accum_states_data = out2->data(); + + size_t sample_num = in0->dims()[0]; + size_t state_var_num = 4; // TP FP TN FN + + // get states info for current batch + for (size_t i = 0; i < sample_num; ++i) { + size_t idx = ids_data[i]; + size_t label = labels_data[i]; + + PADDLE_ENFORCE(idx >= 0 && idx < cls_num, + "Class index of each instance should be in " + "[0, class_number)."); + PADDLE_ENFORCE(label >= 0 && label < cls_num, + "Label of each instance should be in [0, class_number)."); + + T w = weights_data ? 
weights_data[i] : 1.0; + if (idx == label) { + accum_states_data[idx * state_var_num + TP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + } else { + accum_states_data[label * state_var_num + FN] += w; + accum_states_data[idx * state_var_num + FP] += w; + for (size_t j = 0; j < cls_num; ++j) { + accum_states_data[j * state_var_num + TN] += w; + } + accum_states_data[idx * state_var_num + TN] -= w; + accum_states_data[label * state_var_num + TN] -= w; + } + } + + ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num, + cls_num); + + if (states_data) { + for (size_t i = 0; i < cls_num; ++i) { + for (size_t j = 0; j < state_var_num; ++j) { + size_t idx = i * state_var_num + j; + accum_states_data[idx] += states_data[idx]; + } + } + } + + ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num, + cls_num); + } + + // expose to be reused + static inline T CalcPrecision(T tp_count, T fp_count) { + if (tp_count > 0.0 || fp_count > 0.0) { + return tp_count / (tp_count + fp_count); + } + return 1.0; + } + + static inline T CalcRecall(T tp_count, T fn_count) { + if (tp_count > 0.0 || fn_count > 0.0) { + return tp_count / (tp_count + fn_count); + } + return 1.0; + } + + static inline T CalcF1Score(T precision, T recall) { + if (precision > 0.0 || recall > 0.0) { + return 2 * precision * recall / (precision + recall); + } + return 0.0; + } + + protected: + void ComputeMetrics(const T* states_data, double* metrics_data, + size_t state_var_num, size_t cls_num) const { + T total_tp_count = 0; + T total_fp_count = 0; + T total_fn_count = 0; + T macro_avg_precision = 0.0; + T macro_avg_recall = 0.0; + + for (size_t i = 0; i < cls_num; ++i) { + T tp_count = states_data[i * state_var_num + TP]; + T fp_count = states_data[i * state_var_num + FP]; + T fn_count = states_data[i * state_var_num + FN]; + total_tp_count += tp_count; + total_fp_count += fp_count; + total_fn_count += fn_count; + macro_avg_precision += CalcPrecision(tp_count, fp_count); + macro_avg_recall += CalcRecall(tp_count, fn_count); + } + macro_avg_precision /= cls_num; + macro_avg_recall /= cls_num; + T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall); + + T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count); + T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count); + T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall); + + // fill metrics data + metrics_data[0] = macro_avg_precision; + metrics_data[1] = macro_avg_recall; + metrics_data[2] = macro_f1_score; + metrics_data[3] = micro_avg_precision; + metrics_data[4] = micro_avg_recall; + metrics_data[5] = micro_f1_score; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22b970d971221e86a25e0b72f3d3704e5cee5d7f --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
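// Reading note (not part of this patch): PrecisionRecallKernel first fills a
// [class_number, 4] state table (rows laid out as [TP, FP, TN, FN]) and then
// ComputeMetrics turns it into the six BatchMetrics/AccumMetrics entries. Below is a
// standalone sketch with made-up counts; the zero-denominator guards of
// CalcPrecision/CalcRecall/CalcF1Score are omitted for brevity.
#include <cstdio>

int main() {
  const double states[2][4] = {{3, 1, 5, 1}, {4, 1, 4, 1}};  // two classes
  const int cls_num = 2;
  double macro_p = 0, macro_r = 0, tp = 0, fp = 0, fn = 0;
  for (int i = 0; i < cls_num; ++i) {
    macro_p += states[i][0] / (states[i][0] + states[i][1]);  // per-class precision
    macro_r += states[i][0] / (states[i][0] + states[i][3]);  // per-class recall
    tp += states[i][0]; fp += states[i][1]; fn += states[i][3];
  }
  macro_p /= cls_num;
  macro_r /= cls_num;
  const double macro_f1 = 2 * macro_p * macro_r / (macro_p + macro_r);
  const double micro_p = tp / (tp + fp);
  const double micro_r = tp / (tp + fn);
  const double micro_f1 = 2 * micro_p * micro_r / (micro_p + micro_r);
  // printed in the same order as BatchMetrics / AccumMetrics
  std::printf("[%.3f, %.3f, %.3f, %.3f, %.3f, %.3f]\n",
              macro_p, macro_r, macro_f1, micro_p, micro_r, micro_f1);
  return 0;
}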
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class PReluOp : public framework::OperatorWithKernel { + public: + PReluOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); + PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, + "Size of weight Alpha must be one."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class PReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of prelu operator."); + AddInput("Alpha", "The alpha weight of prelu operator."); + AddOutput("Out", "The output tensor of prelu operator."); + AddComment(R"DOC( +PRelu Operator. + +The equation is: + +$$ +f(x) = +\begin{cases} +\alpha * x, \quad \text{if} \ x < 0 \\ +x, \qquad \text{if} \ x >= 0 +\end{cases} +$$ + +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD information with input `X`. + +)DOC"); + } +}; + +// The operator to calculate gradients of a prelu operator. +class PReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("Alpha"), + ctx->GetInputDim("Alpha")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, + ops::PReluGradOp); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..038b09a493c5064d5419260b2fbfdf56b6bb5982 --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prelu_op.h" + +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..85ad75d479001ec5dad1b796d4932c7e6c4ab7af --- /dev/null +++ b/paddle/fluid/operators/prelu_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::Transform; + +template +class PReluFunctor { + public: + explicit PReluFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& x) const { + if (x > 0) + return x; + else + return x * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + auto* alpha_ptr = alpha->data(); + + int numel = x->numel(); + + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); + } +}; + +template +class PReluGradFunctor { + public: + explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {} + + HOSTDEVICE T operator()(const T& out, const T& dout) const { + if (out > 0) + return dout; + else + return dout * (*alpha_); + } + + private: + const T* alpha_; +}; + +template +class PReluGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + auto* dout = context.Input(framework::GradVarName("Out")); + + auto* out = context.Input("Out"); + auto* alpha = context.Input("Alpha"); + auto* alpha_ptr = alpha->data(); + + T* dx_ptr = dx->mutable_data(context.GetPlace()); + const T* dout_ptr = dout->data(); + const T* out_ptr = out->data(); + int numel = dx->numel(); + + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); + + // TODO(Zhuoyuan): add dalpha 
upgrade when GPU kernels ready + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3616545309e8c279f61a22e571a5e71335c47f93 --- /dev/null +++ b/paddle/fluid/operators/print_op.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { + +#define CLOG std::cout + +const std::string kForward = "FORWARD"; +const std::string kBackward = "BACKWARD"; +const std::string kBoth = "BOTH"; + +struct Formater { + std::string message; + std::string name; + std::vector dims; + std::type_index dtype{typeid(char)}; + framework::LoD lod; + int summarize; + void* data{nullptr}; + + void operator()(size_t size) { + PrintMessage(); + PrintName(); + PrintDims(); + PrintDtype(); + PrintLod(); + PrintData(size); + } + + private: + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; } + void PrintName() { + if (!name.empty()) { + CLOG << "Tensor[" << name << "]" << std::endl; + } + } + void PrintDims() { + if (!dims.empty()) { + CLOG << "\tshape: ["; + for (auto i : dims) { + CLOG << i << ","; + } + CLOG << "]" << std::endl; + } + } + void PrintDtype() { + if (dtype.hash_code() != typeid(char).hash_code()) { + CLOG << "\tdtype: " << dtype.name() << std::endl; + } + } + void PrintLod() { + if (!lod.empty()) { + CLOG << "\tLoD: ["; + for (auto level : lod) { + CLOG << "[ "; + for (auto i : level) { + CLOG << i << ","; + } + CLOG << " ]"; + } + CLOG << "]" << std::endl; + } + } + + void PrintData(size_t size) { + PADDLE_ENFORCE_NOT_NULL(data); + // print float + if (dtype.hash_code() == typeid(float).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(double).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int64_t).hash_code()) { + Display(size); + } + } + + template + void Display(size_t size) { + auto* d = (T*)data; + CLOG << "\tdata: "; + if (summarize != -1) { + summarize = std::min(size, (size_t)summarize); + for (int i = 0; i < summarize; i++) { + CLOG << d[i] << ","; + } + } else { + for (size_t i = 0; i < size; i++) { + CLOG << d[i] << ","; + } + } + CLOG << std::endl; + } +}; + +// TODO(ChunweiYan) there should be some other printers for TensorArray +class TensorPrintOp : public framework::OperatorBase { + public: + TensorPrintOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + TensorPrintOp(const TensorPrintOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented."); + } + + void Run(const framework::Scope& 
scope, + const platform::Place& place) const override { + const framework::Variable* in_var_ptr = nullptr; + std::string phase = kForward; + std::string printed_var_name = ""; + + auto& inputs = Inputs(); + if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); + } else if (inputs.find("In@GRAD") != inputs.end() && + !Inputs("In@GRAD").empty()) { + in_var_ptr = scope.FindVar(Input("In@GRAD")); + printed_var_name = Inputs("In@GRAD").front(); + phase = kBackward; + } else { + PADDLE_THROW("Unknown phase, should be forward or backward."); + } + + PADDLE_ENFORCE_NOT_NULL(in_var_ptr); + + auto& in_tensor = in_var_ptr->Get(); + auto* out_var_ptr = scope.FindVar(Output("Out")); + auto& out_tensor = *out_var_ptr->GetMutable(); + + // Just copy data from input tensor to output tensor + // output tensor share same memory with input tensor + out_tensor.ShareDataWith(in_tensor); + out_tensor.set_lod(in_tensor.lod()); + + std::string print_phase = Attr("print_phase"); + if (print_phase != phase && print_phase != kBoth) { + return; + } + + int first_n = Attr("first_n"); + if (first_n > 0 && ++times_ > first_n) return; + + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(in_tensor.lod()); + printed_tensor.Resize(in_tensor.dims()); + + if (platform::is_cpu_place(in_tensor.place())) { + printed_tensor.ShareDataWith(in_tensor); + } else { + // copy data to cpu to print + platform::CPUPlace place; + framework::Copy(in_tensor, place, &printed_tensor); + } + + Formater formater; + if (Attr("print_tensor_name")) { + formater.name = printed_var_name; + } + if (Attr("print_tensor_type")) { + formater.dtype = printed_tensor.type(); + } + if (Attr("print_tensor_shape")) { + auto& dims = printed_tensor.dims(); + formater.dims.resize(dims.size()); + for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i]; + } + if (Attr("print_tensor_lod")) { + formater.lod = printed_tensor.lod(); + } + formater.summarize = Attr("summarize"); + formater.data = (void*)printed_tensor.data(); + formater(printed_tensor.numel()); + } + + private: + mutable int times_{0}; +}; + +class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { + public: + PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In", "Input tensor to be displayed."); + AddAttr("first_n", "Only log `first_n` number of times."); + AddAttr("message", "A string message to print as a prefix."); + AddAttr("summarize", "Number of elements printed."); + AddAttr("print_tensor_name", "Whether to print the tensor name."); + AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); + AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); + AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); + AddAttr( + "print_phase", + "(string, default 'BOTH') Which phase to display including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") + .SetDefault(kBoth) + .InEnum({kForward, kBackward, kBoth}); + AddOutput("Out", "Output tensor with same data as input tensor."); + AddComment(R"DOC( +Creates a print op that will print when a tensor is accessed. 
+ +Wraps the tensor passed in so that whenever that a tensor is accessed, +the message `message` is printed, along with the current value of the +tensor `t`.)DOC"); + } +}; + +class InferShapeForward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); + context->ShareLoD("In", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In")); + } +}; + +class InferShapeBackward : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("In@GRAD"), + "Input(In@GRAD) should not be null."); + context->ShareLoD("In@GRAD", /*->*/ "Out"); + context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); + } +}; + +class InferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override {} +}; + +class PrintOpProtoAndCheckGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print_grad"); + op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); + op_desc_ptr->SetOutput("Out", InputGrad("In")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, + ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, + ops::InferVarType); +REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward); diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed48603e17f38f89705186fb9fb992f69d26d2ff --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
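// Reading note (not part of this patch): the print operator's Formater echoes at most
// `summarize` elements of the tensor data, or all of them when summarize is -1. Below
// is a standalone sketch of that truncation with made-up data.
#include <algorithm>
#include <cstddef>
#include <cstdio>

void Display(const float* d, size_t size, int summarize) {
  std::printf("\tdata: ");
  const size_t n =
      summarize == -1 ? size : std::min(size, static_cast<size_t>(summarize));
  for (size_t i = 0; i < n; ++i) std::printf("%g,", d[i]);
  std::printf("\n");
}

int main() {
  const float data[6] = {1, 2, 3, 4, 5, 6};
  Display(data, 6, 3);   // data: 1,2,3,
  Display(data, 6, -1);  // data: 1,2,3,4,5,6,
  return 0;
}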
*/ + +#include "paddle/fluid/operators/prior_box_op.h" + +namespace paddle { +namespace operators { + +class PriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of PriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of PriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + + auto min_sizes = ctx->Attrs().Get>("min_sizes"); + auto max_sizes = ctx->Attrs().Get>("max_sizes"); + auto variances = ctx->Attrs().Get>("variances"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + bool flip = ctx->Attrs().Get("flip"); + + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + + int num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(), + "The number of min_size and max_size must be equal."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i], + "max_size[%d] must be greater than min_size[%d].", i, + i); + num_priors += 1; + } + } + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } +}; + +class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature data of PriorBoxOp, The layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of PriorBoxOp, The layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. 
" + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + + AddAttr>("min_sizes", + "(vector) List of min sizes " + "of generated prior boxes.") + .AddCustomChecker([](const std::vector& min_sizes) { + PADDLE_ENFORCE_GT(min_sizes.size(), 0, + "Size of min_sizes must be at least 1."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(min_sizes[i], 0, + "min_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "max_sizes", + "(vector) List of max sizes of generated prior boxes."); + AddAttr>( + "aspect_ratios", + "(vector) List of aspect ratios of generated prior boxes."); + + AddAttr>( + "variances", + "(vector) List of variances to be encoded in prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("flip", "(bool) Whether to flip aspect ratios.") + .SetDefault(true); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr("step_w", + "Prior boxes step across width, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr("step_h", + "Prior boxes step across height, 0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Prior boxes center offset.") + .SetDefault(0.5); + AddComment(R"DOC( +Prior box operator +Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. +Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + +Please get more information from the following papers: +https://arxiv.org/abs/1512.02325. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); +REGISTER_OP_CPU_KERNEL( + prior_box, ops::PriorBoxOpKernel, + ops::PriorBoxOpKernel); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fd07041233495660605e9cf9acb33d57eb57bc30 --- /dev/null +++ b/paddle/fluid/operators/prior_box_op.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector& output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior.clear(); + output_aspect_ratior.push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { + if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior.push_back(ar); + if (flip) { + output_aspect_ratior.push_back(1.0f / ar); + } + } + } +} + +template +struct ClipFunctor { + HOSTDEVICE inline T operator()(T in) const { + return std::min(std::max(in, 0.), 1.); + } +}; + +template +class PriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + T inv_img_width = 1.0 / img_width; + T inv_img_height = 1.0 / img_height; + + auto e_boxes = framework::EigenTensor::From(*boxes); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + T box_width, box_height; + int idx = 0; + for (size_t s = 0; s < min_sizes.size(); ++s) { + int min_size = min_sizes[s]; + // first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + + idx++; + if (max_sizes.size() > 0) { + int max_size = max_sizes[s]; + // second prior: aspect_ratio = 1, + // size = sqrt(min_size * max_size) + box_width = box_height = sqrt(min_size * max_size); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + 
e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + + // rest of priors + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + // xmin + e_boxes(h, w, idx, 0) = + (center_x - box_width * 0.5) * inv_img_width; + // ymin + e_boxes(h, w, idx, 1) = + (center_y - box_height * 0.5) * inv_img_height; + // xmax + e_boxes(h, w, idx, 2) = + (center_x + box_width * 0.5) * inv_img_width; + // ymax + e_boxes(h, w, idx, 3) = + (center_y + box_height * 0.5) * inv_img_height; + idx++; + } + } + } + } + + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9e3894c576a94c094c0f4b72e3b6519c4ec26e1 --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace paddle { +namespace operators { + +class ProximalAdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalAdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("MomentOut"), + "Output(MomentOut) of ProximalAdagradOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad of ProximalAdagrad Op must have same dimension."); + + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Moment"), + "Param and Moment of ProximalAdagrad Op must have same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + } +}; + +class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated."); + AddInput("Moment", + "(Tensor, default Tensor) " + "Moment parameter that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +Proximal Adagrad Optimizer. 
+ +Optimizer that implements the proximal adagrad algorithm: + +$$ +moment = moment + grad * grad \\ +prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1 , 0) +$$ + +The paper that proposed Proximal GD: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) +Here, we use the adagrad learning rate as specified here: +(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, + ops::ProximalAdagradOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.cu b/paddle/fluid/operators/proximal_adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..54c75b3abb8e84b2aa55d044e79423cf86523d76 --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.h b/paddle/fluid/operators/proximal_adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..70205a8d11f757d08150a81d1369133778ad996c --- /dev/null +++ b/paddle/fluid/operators/proximal_adagrad_op.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
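// Reading note (not part of this patch): one scalar ProximalAdagrad step with made-up
// values, following the update equations in the op comment (l1 > 0 case). The
// ProximalAdagradOpKernel applies the same rule element-wise over the parameter tensor.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  double param = 1.0, moment = 0.5;
  const double grad = 0.2, lr = 0.1, l1 = 0.01, l2 = 0.001;

  moment += grad * grad;                                      // 0.54
  const double prox = param - lr * grad / std::sqrt(moment);  // prox_param
  const double sign = (prox > 0) - (prox < 0);
  param = sign / (1.0 + lr * l2) * std::max(std::fabs(prox) - lr * l1, 0.0);
  std::printf("moment=%.6f  param=%.6f\n", moment, param);
  return 0;
}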
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalAdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto grad = ctx.Input("Grad"); + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto m = EigenVector::Flatten(*ctx.Input("Moment")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto m_out = EigenVector::Flatten(*moment_out); + auto* place = ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + m_out.device(*place) = m + g * g; + auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); + if (l1 > static_cast(0)) { + p_out.device(*place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(static_cast(0.0))) / + (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(*place) = + prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..de7c6843c8ba7c1cc9229a11707de7a1400deee1 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace paddle { +namespace operators { + +class ProximalGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of ProximalGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of ProximalGDOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of ProximalGDOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"), + "Two input of ProximalGD Op's dimension must be same."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + + AddAttr("l1", + "(float, default 0.0) " + "L1 regularization strength.") + .SetDefault(0.0f); + AddAttr("l2", + "(float, default 0.0) " + "L2 regularization strength.") + .SetDefault(0.0f); + AddComment(R"DOC( +ProximalGD Operator. + +Optimizer that implements the proximal gradient descent algorithm: + +$$ +prox\_param = param - learning\_rate * grad \\ +param = sign(prox\_param) / (1 + learning\_rate * l2) * + \max(|prox\_param| - learning\_rate * l1, 0) +$$ + +The paper that proposed Proximal Gradient Descent: +(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, + ops::ProximalGDOpMaker); +REGISTER_OP_CPU_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.cu b/paddle/fluid/operators/proximal_gd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..97b672e872c99783ef4c0cd085e4b86380a06e10 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +You may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/proximal_gd_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/proximal_gd_op.h b/paddle/fluid/operators/proximal_gd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8372380f25277e7774b72e144e00ea80e76a71e0 --- /dev/null +++ b/paddle/fluid/operators/proximal_gd_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class ProximalGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + + param_out->mutable_data(ctx.GetPlace()); + + auto grad = ctx.Input("Grad"); + + auto l1 = static_cast(ctx.Attr("l1")); + auto l2 = static_cast(ctx.Attr("l2")); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto g = EigenVector::Flatten(*grad); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + + auto p_out = EigenVector::Flatten(*param_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + auto prox_param = p - lr.broadcast(grad_dsize) * g; + if (l1 > 0) { + p_out.device(place) = + prox_param.sign() * + (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) + .cwiseMax(T(0.0))) / + (1.0 + (lr * l2).broadcast(grad_dsize))); + } else { + p_out.device(place) = + prox_param / (1.0 + (lr * l2).broadcast(grad_dsize)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..222ca73d2acfa8cc3d6fa6a3badce4606be9bcb0 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
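ProximalGD above applies the same proximal step without the adaptive Adagrad scaling. A minimal NumPy sketch of one update, mirroring the formula in the operator comment (the function name and sample values are illustrative only):

    import numpy as np

    def proximal_gd_step(param, grad, lr, l1=0.0, l2=0.0):
        # prox_param = param - lr * grad
        prox_param = param - lr * grad
        if l1 > 0.0:
            # sign(prox_param) * max(|prox_param| - lr * l1, 0) / (1 + lr * l2)
            return (np.sign(prox_param)
                    * np.maximum(np.abs(prox_param) - lr * l1, 0.0)
                    / (1.0 + lr * l2))
        return prox_param / (1.0 + lr * l2)

    p = proximal_gd_step(np.array([0.5, -0.02]), np.array([0.1, 0.1]), lr=0.1, l1=0.05)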
*/ + +#include "paddle/fluid/operators/rank_loss_op.h" + +namespace paddle { +namespace operators { + +class RankLossOp : public framework::OperatorWithKernel { + public: + RankLossOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + + auto label_dims = ctx->GetInputDim("Label"); + auto left_dims = ctx->GetInputDim("Left"); + auto right_dims = ctx->GetInputDim("Right"); + + PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); + ctx->SetOutputDim("Out", label_dims); + } +}; + +class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Label", + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); + AddComment(R"DOC( +RankLoss Operator. + +RankLoss operator for RankNet +(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). +RankNet is a pairwise ranking model with +one training sample consisting of a pair of doc A and B, and the label P +indicating that A is ranked higher than B or not: + +P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of +the input pair. + +The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: + +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ + o_{i,j} = o_i - o_j \\ + \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} +$$ + +The operator can take batch inputs with size batch_size (batch_size >= 1). 
+ +)DOC"); + } +}; + +class RankLossGradOp : public framework::OperatorWithKernel { + public: + RankLossGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + auto dims = ctx->GetInputDim("Left"); + auto left_grad_name = framework::GradVarName("Left"); + auto right_grad_name = framework::GradVarName("Right"); + + if (ctx->HasOutput(left_grad_name)) { + ctx->SetOutputDim(left_grad_name, dims); + } + + if (ctx->HasOutput(right_grad_name)) { + ctx->SetOutputDim(right_grad_name, dims); + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, + ops::RankLossGradOp); +REGISTER_OP_CPU_KERNEL( + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b182ced70d2a234237f1de822c3dc81047ebda7 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/rank_loss_op.h" + +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..08bb2c28218e8c478af426b560efc9a4b6161696 --- /dev/null +++ b/paddle/fluid/operators/rank_loss_op.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RankLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + out_t->mutable_data(ctx.GetPlace()); + + auto out = framework::EigenVector::Flatten(*out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + auto& dev = *ctx.template device_context().eigen_device(); + out.device(dev) = + (1. + (left - right).exp()).log() - label * (left - right); + } +}; + +template +class RankLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_left_t = + ctx.Output(framework::GradVarName("Left")); + auto* d_right_t = + ctx.Output(framework::GradVarName("Right")); + + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); + + auto& dev = *ctx.template device_context().eigen_device(); + auto d_out = framework::EigenVector::Flatten(*d_out_t); + auto label = framework::EigenVector::Flatten(*label_t); + auto left = framework::EigenVector::Flatten(*left_t); + auto right = framework::EigenVector::Flatten(*right_t); + + // compute d_left + if (d_left_t) { + d_left_t->mutable_data(ctx.GetPlace()); + auto d_left = framework::EigenVector::Flatten(*d_left_t); + d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label); + } + // compute d_right + if (d_right_t) { + d_right_t->mutable_data(ctx.GetPlace()); + auto d_right = framework::EigenVector::Flatten(*d_right_t); + d_right.device(dev) = + -d_out * (1.0 / (1. + (right - left).exp()) - label); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d562c291911f54c9d1e8fed2e84035808bffbb7 --- /dev/null +++ b/paddle/fluid/operators/read_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
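The rank_loss kernels above evaluate C_{i,j} = -P~_{i,j} * o_{i,j} + log(1 + e^{o_{i,j}}) with o_{i,j} = left - right, and the gradient kernel distributes d_out through the sigmoid of that difference. A NumPy sketch of both, with made-up sample values (shapes follow the [batch_size x 1] convention of the inputs):

    import numpy as np

    def rank_loss(label, left, right):
        o = left - right                            # o_ij = o_i - o_j
        return np.log1p(np.exp(o)) - label * o

    def rank_loss_grad(d_out, label, left, right):
        sig = 1.0 / (1.0 + np.exp(right - left))    # sigmoid(left - right)
        return d_out * (sig - label), -d_out * (sig - label)

    label = np.array([[1.0], [0.5], [0.0]])
    left  = np.array([[0.7], [0.4], [0.1]])
    right = np.array([[0.1], [0.4], [0.9]])
    loss = rank_loss(label, left, right)
    d_left, d_right = rank_loss_grad(np.ones_like(loss), label, left, right)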
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { + +class ReadInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Reader"), + "The ReadOp must take a reader as input."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "The ReadOp should be assigned with output."); + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); + } +}; + +class ReadInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarDesc::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } + } +}; + +class ReadOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { + framework::ReaderHolder* reader = + scope.FindVar(Input("Reader"))->GetMutable(); + if (!reader->HasNext()) { + reader->ReInit(); + PADDLE_ENFORCE( + reader->HasNext(), + "Reader can not read the next data even it has been re-initialized."); + } + std::vector out_arg_names = Outputs("Out"); + std::vector ins; + reader->ReadNext(&ins); + PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); + for (size_t i = 0; i < ins.size(); ++i) { + auto* out = + scope.FindVar(out_arg_names[i])->GetMutable(); + out->ShareDataWith(ins[i]); + out->set_lod(ins[i].lod()); + } + } +}; + +class ReadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput("Reader", "(ReaderHolder) The executed reader."); + AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable(); + AddComment(R"DOC( + Read Operator + + Execute a given reader once and output data. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker, + paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4b9b8dab9b0394752d538aa5f59be3c06d0188f --- /dev/null +++ b/paddle/fluid/operators/recurrent_op.cc @@ -0,0 +1,635 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
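read_op.cc below re-initializes the reader exactly once when it has been exhausted, and fails if it still has no data afterwards. The same control flow in plain Python, where the Reader class is only a stand-in for ReaderHolder, not a Paddle API:

    class Reader:
        """Stand-in reader: serves fixed-size batches from a list and can be reset."""
        def __init__(self, data, batch_size):
            self.data, self.batch_size, self.pos = data, batch_size, 0
        def has_next(self):
            return self.pos < len(self.data)
        def re_init(self):
            self.pos = 0
        def read_next(self):
            batch = self.data[self.pos:self.pos + self.batch_size]
            self.pos += self.batch_size
            return batch

    def read_op_run(reader):
        # Mirrors ReadOp::Run: re-initialize once if the reader is exhausted.
        if not reader.has_next():
            reader.re_init()
            assert reader.has_next(), "reader is empty even after re-initialization"
        return reader.read_next()

    r = Reader(list(range(10)), batch_size=4)
    batches = [read_op_run(r) for _ in range(4)]   # wraps around after the 3rd call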
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +constexpr char kInputs[] = "inputs"; +constexpr char kInitialStates[] = "initial_states"; +constexpr char kParameters[] = "parameters"; +constexpr char kOutputs[] = "outputs"; +constexpr char kStepScopes[] = "step_scopes"; +constexpr char kExStates[] = "ex_states"; +constexpr char kStates[] = "states"; +constexpr char kStepBlock[] = "sub_block"; +constexpr char kReverse[] = "reverse"; +constexpr char kIsTrain[] = "is_train"; +#define GRAD_SUFFIX "@GRAD" +constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX; +constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX; +constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX; +constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX; + +using StepScopeVar = std::vector; + +// StepScopes manages scopes inside RNN. +// StepScopes::CurScope() get the current scope +// StepScopes::ExScope() get the ex-scope, or scope in previous time step. +// StepScopes::Next() move to next time step. +// +// if is_train = False, then +// there are two scopes for the RNN and just support forward. +// else +// the len(scopes) == seq_len +// +// if is_backward = True, then +// reversely access scopes +// else +// access scopes from begin to end. +class StepScopes { + public: + StepScopes(const framework::Scope &parent, StepScopeVar *scopes, + bool is_train, size_t seq_len, bool is_backward = false) + : counter_(is_backward ? seq_len - 1 : 0UL), + scopes_(scopes), + is_train_(is_train), + is_backward_(is_backward) { + size_t num_step_scopes = is_train ? seq_len : 2; + PADDLE_ENFORCE(is_train || !is_backward, + "Cannot backward when is not training"); + if (!is_backward_) { + PADDLE_ENFORCE(scopes->empty()); + scopes->reserve(static_cast(num_step_scopes)); + for (size_t i = 0; i < num_step_scopes; ++i) { + scopes->emplace_back(&parent.NewScope()); + } + } + } + + framework::Scope &CurScope() { return GetScope(counter_); } + + framework::Scope &ExScope() { + auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1); + return scope; + } + + void Next() { + if (is_backward_) { + --counter_; + } else { + ++counter_; + } + } + + private: + framework::Scope &GetScope(size_t scope_id) const { + if (!is_train_) { + scope_id %= 2; + } + PADDLE_ENFORCE_LT(scope_id, scopes_->size()); + return *(*scopes_)[scope_id]; + } + + size_t counter_; + StepScopeVar *scopes_; + bool is_train_; + bool is_backward_; +}; + +// Base class for RecurrentOp/RecurrentGradOp +// Some common protected functions for RecurrentOp/RecurrentGradOp +class RecurrentBase : public framework::OperatorBase { + public: + RecurrentBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + // Get SequenceLength from Scope + // The sequence length is got from input tensor. The input tensor's + // dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape + // is SEQ_LEN. 
The second of the tensor's shape could be the batch size or + // nested sequence length. + int64_t GetSequenceLength(const framework::Scope &scope) const { + // Dim format SEQ_LEN, BATCH_SIZE, ... + int64_t seq_len = -1; + auto &all_inputs = Inputs(kInputs); + PADDLE_ENFORCE(!all_inputs.empty()); + for (auto &iname : all_inputs) { + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE(var->IsType()); + auto &dim = var->Get().dims(); + if (seq_len == -1) { + seq_len = dim[0]; + } else { + PADDLE_ENFORCE_EQ(seq_len, dim[0]); + } + } + return seq_len; + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.Var, dst_vars)): + // dst_tensor.ShareDataWith(src_tensor) + static void LinkTensor(const framework::Scope &src_scope, + const std::vector &src_vars, + framework::Scope *dst_scope, + const std::vector &dst_vars) { + LinkTensorWithCallback( + src_scope, src_vars, dst_scope, dst_vars, + [&](const framework::Tensor &src, framework::Tensor *dst) { + dst->ShareDataWith(src); + }); + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.Var, dst_vars)): + // callback(src_tensor, &dst_tensor) + template + static void LinkTensorWithCallback(const framework::Scope &src_scope, + const std::vector &src_vars, + framework::Scope *dst_scope, + const std::vector &dst_vars, + Callback callback) { + PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); + for (size_t i = 0; i < dst_vars.size(); ++i) { + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + } + } + + // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars), + // map(dst_scope.FindVar, dst_vars)): + // callback(src_tensor, &dst_tensor) + template + static void LinkTensorWithCallback(const framework::Scope &src_scope, + const std::vector &src_vars, + const framework::Scope &dst_scope, + const std::vector &dst_vars, + Callback callback) { + PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); + for (size_t i = 0; i < dst_vars.size(); ++i) { + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; + AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); + } + } + + // (seq_len, shape) -> return [seq_len] + list(shape) + static framework::DDim PrependDims(size_t seq_len, + const framework::DDim &src) { + auto dims = framework::vectorize(src); + dims.insert(dims.begin(), static_cast(seq_len)); + return framework::make_ddim(dims); + } + + private: + template + static void AccessTensor(const framework::Scope &src_scope, + const std::string &src_var_name, + framework::Scope *dst_scope, + const std::string &dst_var_name, Callback callback) { + auto *src_var = src_scope.FindVar(src_var_name); + PADDLE_ENFORCE(src_var != nullptr); + auto &src_tensor = src_var->Get(); + + auto *dst_var = dst_scope->Var(dst_var_name); + auto *dst_tensor = dst_var->GetMutable(); + callback(src_tensor, dst_tensor); + } + + template + static void AccessTensor(const framework::Scope &src_scope, + const std::string &src_var_name, + const framework::Scope &dst_scope, + const std::string &dst_var_name, Callback callback) { + auto *src_var = src_scope.FindVar(src_var_name); + PADDLE_ENFORCE(src_var != nullptr); + auto &src_tensor = src_var->Get(); + auto *dst_var = dst_scope.FindVar(dst_var_name); + PADDLE_ENFORCE(dst_var != nullptr); + auto *dst_tensor = dst_var->GetMutable(); + callback(src_tensor, dst_tensor); + } +}; + +class RecurrentOp : public 
RecurrentBase { + public: + RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : RecurrentBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto seq_len = static_cast(this->GetSequenceLength(scope)); + VLOG(3) << "Static RNN input sequence length = " << seq_len; + StepScopes scopes = CreateStepScopes(scope, seq_len); + auto reverse = Attr(kReverse); + + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + for (size_t i = 0; i < seq_len; ++i) { + size_t seq_offset = reverse ? seq_len - i - 1 : i; + VLOG(3) << "Recurrent operate at the time step " << seq_offset; + + auto &cur_scope = scopes.CurScope(); + + // Link outside::input --> inside::input + // inside::input = outside::input[seq_offset: seq_offset+1] + LinkTensorWithCallback( + scope, Inputs(kInputs), &cur_scope, Inputs(kInputs), + [&seq_offset](const framework::Tensor &outside, + framework::Tensor *inside) { + inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); + auto dims = framework::vectorize(inside->dims()); + dims.erase(dims.begin()); + inside->Resize(framework::make_ddim(dims)); + }); + + if (i == 0) { + // Link initial states --> ex_states + LinkTensor(scope, Inputs(kInitialStates), &cur_scope, + Attr>(kExStates)); + } else { + auto &ex_scope = scopes.ExScope(); + // Link ex_scope::state --> cur_scope::ex_state + LinkTensor(ex_scope, Attr>(kStates), + &cur_scope, Attr>(kExStates)); + } + + // Every inputs are linked now, execute! + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/); + + // get device context from pool + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // Copy inside::output -> outside::output + // outside::output[seq_offset: seq_offset + 1] = inside::output + this->LinkTensorWithCallback( + cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs), + [&](const framework::LoDTensor &src_tensor, + framework::LoDTensor *dst_tensor) { + if (i == 0) { // create output tensor at begin + dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); + dst_tensor->mutable_data(place, src_tensor.type()); + } + + auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); + // Explicit copy output since the local RNN scope can be destroyed + // early. 
+ framework::Copy(src_tensor, place, dev_ctx, &dst_out); + }); + + scopes.Next(); + } + } + + private: + StepScopes CreateStepScopes(const framework::Scope &scope, + size_t seq_len) const { + auto *var = scope.FindVar(Output(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + return StepScopes(scope, var->GetMutable(), + Attr(kIsTrain), seq_len); + } +}; + +class RecurrentGradOp : public RecurrentBase { + public: + RecurrentGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : RecurrentBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto seq_len = static_cast(GetSequenceLength(scope)); + StepScopes scopes = CreateStepScopes(scope, seq_len); + auto reverse = Attr(kReverse); + + framework::Executor executor(place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t step_id = 0; step_id < seq_len; ++step_id) { + size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; + VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; + auto &cur_scope = scopes.CurScope(); + // Link outside::output_grads --> inside::output_grads + // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] + LinkTensorWithCallback( + scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads), + [&](const framework::Tensor &outside, framework::Tensor *inside) { + inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); + auto dims = framework::vectorize(inside->dims()); + dims.erase(dims.begin()); + inside->Resize(framework::make_ddim(dims)); + }); + auto og_set = List2Set(Inputs(kOutputGrads)); + + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + std::copy(og_set.begin(), og_set.end(), + std::ostream_iterator(sout, ",")); + VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; + } + + // Link states + // if cur_scope::cur_state_grad in out_grads: + // cur_scope::cur_state_grad += ex_scope::ex_state_grad + // else: + // ex_scope::ex_state_grad --> cur_scope::cur_state_grad + if (step_id != 0) { // not at beginning + auto &ex_scope = scopes.ExScope(); + auto ex_state_grads = + GradVarLists(Attr>(kExStates)); + auto cur_state_grads = + GradVarLists(Attr>(kStates)); + + PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); + for (size_t i = 0; i < ex_state_grads.size(); ++i) { + auto &cur_grad = cur_state_grads[i]; + auto &ex_grad = ex_state_grads[i]; + auto &ex_tensor = + ex_scope.FindVar(ex_grad)->Get(); + + VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; + auto *cur_grad_var = cur_scope.Var(cur_grad); + auto cur_grad_tensor = + cur_grad_var->GetMutable(); + framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor); + } + } + + VLOG(5) << "Recurrent memory linking finished "; + // Run step block with cur_scope + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/); + + VLOG(5) << "executor.Run finished "; + + auto local_var_names = LocalVarNames(cur_scope); + + // Accumulate params + // if (step == 0): + // outside::param_grad = 0.0 + // outside::param_grad += inside::param_grad + { + auto &pg_names = Outputs(kParamGrads); + auto &p_names = Inputs(kParameters); + PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); + + for (size_t 
param_id = 0; param_id < pg_names.size(); ++param_id) { + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // If does not compute gradient of that variable inside rnn, just + // continue + if (local_var_names.find(inside_grad_name) == local_var_names.end()) { + continue; + } + + // zero gradient variable in step 0 + if (step_id == 0) { + auto &inside_tensor = cur_scope.FindVar(inside_grad_name) + ->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); + zero_op->Run(scope, place); + } + + auto new_inside_name = cur_scope.Rename(inside_grad_name); + // sum gradient + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, place); + + cur_scope.Rename(new_inside_name, inside_grad_name); + } + } + VLOG(5) << "Accumulate Parameter finished "; + + // Copy input gradient from inside to outside + // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad + LinkTensorWithCallback( + cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + if (inside.memory_size() == 0) { // IG is not created. + return; + } + if (step_id == 0) { // alloc memory + outside->Resize(PrependDims(seq_len, inside.dims())); + outside->mutable_data(place, inside.type()); + } + + auto dst = outside->Slice(seq_offset, seq_offset + 1); + framework::Copy(inside, place, dev_ctx, &dst); + }); + VLOG(5) << "Link outside gradient finished "; + + if (step_id + 1 == seq_len) { // at_end + // copy initialize states gradient from inside to outside + LinkTensorWithCallback( + cur_scope, GradVarLists(Attr>(kExStates)), + scope, Outputs(kInitStateGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + outside->Resize(inside.dims()); + outside->mutable_data(place, inside.type()); + framework::Copy(inside, place, dev_ctx, outside); + }); + VLOG(5) << "Link initialize state gradient finished "; + } + scopes.Next(); + } + } + + private: + StepScopes CreateStepScopes(const framework::Scope &scope, + size_t seq_len) const { + auto *var = scope.FindVar(Input(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + return StepScopes(scope, var->GetMutable(), + Attr(kIsTrain), seq_len, true /*is_backward*/); + } + + std::unordered_set List2Set( + const std::vector &list) const { + std::unordered_set local_var_name_set; + local_var_name_set.reserve(list.size()); + for (auto &each : list) { + local_var_name_set.insert(each); + } + return local_var_name_set; + } + + std::unordered_set LocalVarNames( + const framework::Scope &scope) const { + return this->List2Set(scope.LocalVarNames()); + } + static std::vector GradVarLists( + const std::vector &var_names) { + std::vector retv; + retv.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv), + framework::GradVarName); + return retv; + } +}; + +class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "rnn inputs").AsDuplicable(); 
+ AddInput(kInitialStates, "rnn initial states").AsDuplicable(); + AddInput(kParameters, + "Parameters are used by step block as its input. However, the " + "input is not a sequence tensor. Every time step, each operator " + "in step block just use the parameter directly.") + .AsDuplicable(); + AddOutput(kOutputs, + "The output sequence of RNN. The sequence length must be same.") + .AsDuplicable(); + AddOutput(kStepScopes, + "StepScopes contain all local variables in each time step."); + AddAttr>(kExStates, + string::Sprintf( + R"DOC(The ex-state variable names. +The ex-state means the state value in the ex-timestep or the previous time step +[%s, %s, %s] must be the same order)DOC", + kExStates, kStates, kInitStateGrads)); + AddAttr>( + kStates, + string::Sprintf( + "The state variable names. [%s, %s, %s] must be the same order", + kExStates, kStates, kInitStateGrads)); + AddAttr(kStepBlock, "The step block inside RNN"); + AddAttr(kReverse, R"DOC(Calculate RNN reversely or not. +By default reverse=False + +Assume the input data is [A, B, C, D] + +if reverse is False: + the computation of RNN is like + A B C D + | | | | + v v v v + rnn -----> rnn -----> rnn ----> rnn + | | | | + v v v v + o o o o + +if reverse is True + the computation of RNN is like + A B C D + | | | | + v v v v + rnn <----- rnn <----- rnn <---- rnn + | | | | + v v v v + o o o o +)DOC").SetDefault(false); + AddAttr(kIsTrain, "").SetDefault(true); + AddComment(R"DOC( +Static Length Recurrent Operator. + +The static length recurrent operator can only operate on fixed size sequence +data, i.e. in each mini-batch, the sequence length of all inputs are the same. + +)DOC"); + } +}; + +class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("recurrent_grad"); + for (auto &input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kStepScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class RecurrentGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + std::vector input{kInputs, kInitialStates}; + std::vector output{kOutputs}; + for (auto &s : input) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); + } + for (auto &s : output) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + } + for (auto &s : input) { + ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + } + if (ctx->HasInputs(kParameters)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + } + } +}; + +} // namespace operators +} // 
namespace paddle + +REGISTER_OPERATOR(recurrent, paddle::operators::RecurrentOp, + paddle::operators::RecurrentOpProtoMaker, + paddle::operators::RecurrentGradOpDescMaker); +REGISTER_OPERATOR(recurrent_grad, paddle::operators::RecurrentGradOp, + paddle::operators::RecurrentGradOpShapeInference); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c093f60ceed4171ee4ab7f0e5757af2ee5950270 --- /dev/null +++ b/paddle/fluid/operators/recv_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class RecvOp : public framework::OperatorBase { + public: + RecvOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i]; + client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(client_.Wait()); + } + + private: + mutable detail::RPCClient client_; +}; + +class RecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); + AddComment(R"DOC( +Recv operator + +This operator can get variables from server side. +)DOC"); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4d9d4cc07b1f76ed04e17bc1cc65293163fb6f2 --- /dev/null +++ b/paddle/fluid/operators/reduce_op.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
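Returning to recurrent_op.cc above: the forward Run loop slices the input along the leading time dimension, links the previous step's states in as ex_states, runs the step block, and copies each step's result into a [seq_len, ...] output. A minimal NumPy sketch of that loop; the tanh cell here is only a stand-in for the user-defined step block, and all names are illustrative:

    import numpy as np

    def static_rnn_forward(x, h0, w_x, w_h, reverse=False):
        # x: [seq_len, batch, input], h0: [batch, hidden]
        seq_len = x.shape[0]
        out = np.zeros((seq_len,) + h0.shape, dtype=x.dtype)  # PrependDims(seq_len, ...)
        h = h0
        for i in range(seq_len):
            t = seq_len - i - 1 if reverse else i             # seq_offset
            step_in = x[t]                                     # Slice(t, t + 1), squeezed
            h = np.tanh(step_in @ w_x + h @ w_h)               # the step block
            out[t] = h                                         # inside::output -> outside slice
        return out

    x = np.random.randn(5, 2, 3); h0 = np.zeros((2, 4))
    y = static_rnn_forward(x, h0, np.random.randn(3, 4), np.random.randn(4, 4))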
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + int dim = ctx->Attrs().Get("dim"); + if (dim < 0) dim = x_rank + dim; + PADDLE_ENFORCE_LT( + dim, x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim || x_rank == 1) { + dims_vector[dim] = 1; + } else { + dims_vector.erase(dims_vector.begin() + dim); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dim != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + int dim = ctx->Attrs().Get("dim"); + if (dim < 0) dim = x_rank + dim; + PADDLE_ENFORCE_LT( + dim, x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr( + "dim", + "(int, default 0) The dimension to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim < 0`, the dim to reduce is `rank + dim`. 
" + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault(0); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + comment_ = R"DOC( +{ReduceOp} Operator. + +This operator computes the {reduce} of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC"; + AddComment(comment_); + } + + protected: + std::string comment_; + + void Replace(std::string &src, std::string from, std::string to) { + std::size_t len_from = std::strlen(from.c_str()); + std::size_t len_to = std::strlen(to.c_str()); + for (std::size_t pos = src.find(from); pos != std::string::npos; + pos = src.find(from, pos + len_to)) { + src.replace(pos, len_from, to); + } + } + + void SetComment(std::string name, std::string op) { + Replace(comment_, "{ReduceOp}", name); + Replace(comment_, "{reduce}", op); + } +}; + +class ReduceSumOpMaker : public ReduceOpMaker { + public: + ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceSum", "sum"); + AddComment(comment_); + } +}; + +class ReduceMeanOpMaker : public ReduceOpMaker { + public: + ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMean", "mean"); + AddComment(comment_); + } +}; + +class ReduceMaxOpMaker : public ReduceOpMaker { + public: + ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMax", "max"); + AddComment(comment_); + } +}; + +class ReduceMinOpMaker : public ReduceOpMaker { + public: + ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceMin", "min"); + AddComment(comment_); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad, + ops::ReduceGradOp); + +REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker, + reduce_mean_grad, ops::ReduceGradOp); + +REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, + ops::ReduceGradOp); + +REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, + ops::ReduceGradOp); + +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ca107ebfe9b617bd5e952965543549a8d92a5b1 --- /dev/null +++ b/paddle/fluid/operators/reduce_op.cu @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/reduce_op.h" + +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel, \ + ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel, \ + ops::ReduceGradKernel); + +FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a153cf272b5dd8abcba1bc7d3d02c480702eae4d --- /dev/null +++ b/paddle/fluid/operators/reduce_op.h @@ -0,0 +1,257 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
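The dim, keep_dim, and reduce_all attributes defined in reduce_op.cc above determine the output shape: a negative dim wraps around, keep_dim keeps the reduced axis with length 1, and reduce_all collapses every dimension. The same shape rules expressed in NumPy, using sum as the representative functor (the helper name is illustrative only):

    import numpy as np

    def reduce_sum_like(x, dim=0, keep_dim=False, reduce_all=False):
        if reduce_all:
            out = x.sum()
            return out.reshape([1] * x.ndim) if keep_dim else out.reshape([1])
        if dim < 0:
            dim += x.ndim          # dim in [-rank, rank) wraps to rank + dim
        return x.sum(axis=dim, keepdims=keep_dim)

    x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
    assert reduce_sum_like(x, dim=1).shape == (2, 4)
    assert reduce_sum_like(x, dim=-1, keep_dim=True).shape == (2, 3, 1)
    assert reduce_sum_like(x, reduce_all=True).shape == (1,)
    assert reduce_sum_like(x, reduce_all=True, keep_dim=True).shape == (1, 1, 1)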
*/ + +#pragma once + +#include "glog/logging.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim); + } +}; + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim) / dx.constant(size); + } +}; + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + auto equals = x == y.broadcast(dim); + auto ones = dx.constant(1); + auto zeros = dx.constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. 
+ dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros); + } +}; + +template +class ReduceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + Functor functor; + functor(place, x, out, reduce_dim); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceCompute<1>(context); + break; + case 2: + ReduceCompute<2>(context); + break; + case 3: + ReduceCompute<3>(context); + break; + case 4: + ReduceCompute<4>(context); + break; + case 5: + ReduceCompute<5>(context); + break; + case 6: + ReduceCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceCompute(const framework::ExecutionContext& context) const { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto x = EigenTensor::From(*input); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + auto reduce_dim = Eigen::array({{dim}}); + // construct the squeezed output tensor + bool keep_dim = context.Attr("keep_dim"); + DDim dims = output->dims(); + auto dims_vector = vectorize(dims); + if (keep_dim && x_rank > 1) { + dims_vector.erase(dims_vector.begin() + dim); + dims = framework::make_ddim(dims_vector); + } + + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, x, out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, dims); + functor(place, x, out, reduce_dim); + } + } +}; + +template +class ReduceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + if (reduce_all) { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + auto x = EigenVector::Flatten(*input0); + auto x_reduce = EigenVector::From(*input1); + auto x_reduce_grad = EigenVector::From(*input2); + auto x_grad = EigenVector::Flatten(*output); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input0->numel())}}); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + ReduceGradCompute<1>(context); + break; + case 2: + ReduceGradCompute<2>(context); + break; + case 3: + ReduceGradCompute<3>(context); + break; + case 4: + ReduceGradCompute<4>(context); + break; + case 5: + ReduceGradCompute<5>(context); + break; + case 6: + ReduceGradCompute<6>(context); + break; + } + } + } + + private: + template + void ReduceGradCompute(const framework::ExecutionContext& context) const { + auto* input0 = context.Input("X"); + auto* input1 = 
context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + + output->mutable_data(context.GetPlace()); + auto x = EigenTensor::From(*input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + int dim = static_cast(context.Attr("dim")); + if (dim < 0) dim = x_rank + dim; + DDim dims = input0->dims(); + dims[dim] = 1; + auto x_reduce = EigenTensor::From(*input1, dims); + auto x_reduce_grad = EigenTensor::From(*input2, dims); + + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + broadcast_dim[dim] = input0->dims()[dim]; + auto& place = + *context.template device_context().eigen_device(); + Functor functor; + functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, + broadcast_dim[dim]); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ + __macro(reduce_sum, SumFunctor, SumGradFunctor); \ + __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ + __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ + __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..148a65bb4b7fe599f2fdb833c179665e58fe1c41 --- /dev/null +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,270 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input lod tensor to be reordered according to " + "Input(RankTable)."); + AddInput("RankTable", + "(LoDRankTable), the rank table according to which Input(X) is " + "reordered."); + AddOutput("Out", "(LoDTensor), the reordered lod tensor."); + AddComment(R"DOC(ReorderLoDTensorByRankTable operator. + +Input(X) is a batch of sequences. Input(RankTable) stores new orders of the +input sequence batch. The reorder_lod_tensor_by_rank operator reorders the +Input(X) according to the information provided by Input(RankTable). + +For example: + +If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the +Input(X) will be reordered that the fourth sequence in Input(X) will become the +first one, and then followed by the original first, third, and the second one. + +This is: +X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1]. 
+Out = [Seq3, Seq0, Seq2, Seq1] with a new LoD information. + +If the LoD information of Input(X) is empty, this means Input(X) is not sequence +data. This is also identical to a batch of sequences where each sequence has a +fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders +each slice of Input(X) along the first axis according to Input(RankTable). + +This is: +X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The +indices in RankTable are [3, 0, 2, 1]. +Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. + +NOTE: This operator sorts Input(X) according to a given LoDRankTable which does +not need to be calculated according to Input(X). It can be calculated according +to another different sequence, and then this operator sorts Input(X) according +to the given LoDRankTable. + +)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(place, x, rank_table, &out); + } + + protected: + virtual void process(const platform::Place &place, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. 
+ size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + } + + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::Place &place, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, absolute_table.size()); + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public 
ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f80cc06abaa536d1b1097850047fd370246dee --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
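To make the permutation concrete, the following standalone C++ sketch (illustrative only, not Paddle code; the names rank, out, offsets and restored are invented) replays the example from the operator comment: the forward pass copies sequences in rank-table order, and the gradient pass restores the original order by sorting (position, rank-index) pairs, mirroring the std::sort in ReorderLoDTensorByRankGradOp.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  std::vector<char> x = {'A', 'B', 'C', 'D'};  // four sequences (length 1 each)
  std::vector<size_t> rank = {3, 0, 2, 1};     // indices from the LoDRankTable

  // Forward pass: out[i] = x[rank[i]]  ->  D A C B
  std::vector<char> out(x.size());
  for (size_t i = 0; i < rank.size(); ++i) out[i] = x[rank[i]];

  // Backward pass: sort (position, rank index) pairs by rank index and copy
  // sequentially, which applies the inverse permutation and restores A B C D.
  std::vector<std::pair<size_t, size_t>> offsets;
  for (size_t i = 0; i < rank.size(); ++i) offsets.emplace_back(i, rank[i]);
  std::sort(offsets.begin(), offsets.end(),
            [](const std::pair<size_t, size_t>& a,
               const std::pair<size_t, size_t>& b) { return a.second < b.second; });
  std::vector<char> restored(out.size());
  for (size_t j = 0; j < offsets.size(); ++j) restored[j] = out[offsets[j].first];

  for (char c : out) std::cout << c;       // DACB
  std::cout << '\n';
  for (char c : restored) std::cout << c;  // ABCD
  std::cout << '\n';
}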
*/ + +#include "paddle/fluid/operators/reshape_op.h" + +namespace paddle { +namespace operators { + +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + // input check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); + auto x_dims = ctx->GetInputDim("X"); + + std::vector neg_dims_idx; + // set some dimension to -1 if it is unknown + const int unknown_size = -1; + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, + "Each dimension of Attr(shape) must be positive or %d.", + unknown_size); + if (shape[i] == unknown_size) { + neg_dims_idx.push_back(i); + PADDLE_ENFORCE(neg_dims_idx.size() <= 1, + "Only one dimension of Attr(shape) can be unknown."); + } + } + + int64_t capacity = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + int64_t in_size = framework::product(x_dims); + if (neg_dims_idx.size() == 1) { + // dim infer + shape[neg_dims_idx[0]] = in_size / (-capacity); + // recalculate capacity + capacity = shape[neg_dims_idx[0]] * (-capacity); + } + // capacity check + PADDLE_ENFORCE(capacity == in_size, + "The size of Input(X) mismatches with Attr(shape)."); + // resize output + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto out_dims = framework::make_ddim(shape_int64); + ctx->SetOutputDim("Out", out_dims); + if (shape[0] == x_dims[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of reshape operator."); + AddOutput("Out", "The output tensor of reshape operator."); + AddAttr>("shape", + "(vector) " + "Target shape of reshape operator."); + AddComment(R"DOC( +Reshape Operator. + +Reshape Input(X) into the shape specified by Attr(shape). + +An example: +Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] + +and target shape = [1, 4], the reshape operator will transform +the tensor X into a 2-D tensor: [[1, 2, 3, 4]] + +One dimension in the target shape can be set -1, representing that its +size is unknown. In this case, the real dimension will be infered from +the original shape of Input(X) and other dimensions in the target shape. 
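As a hedged illustration of the shape-inference rule described in the comment above, the following standalone sketch (not the Paddle implementation; InferReshape is an invented helper) infers a single -1 entry from the input element count and checks that the sizes match:

#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Infer one -1 entry of the target shape from the number of input elements.
std::vector<int> InferReshape(std::vector<int> shape, int64_t in_size) {
  int unknown = -1;
  int64_t known = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      assert(unknown == -1 && "only one dimension may be -1");
      unknown = static_cast<int>(i);
    } else {
      known *= shape[i];
    }
  }
  if (unknown >= 0) shape[unknown] = static_cast<int>(in_size / known);
  assert(std::accumulate(shape.begin(), shape.end(), int64_t{1},
                         std::multiplies<int64_t>()) == in_size &&
         "target shape must match the input size");
  return shape;
}

int main() {
  // A 2 x 2 input (4 elements) reshaped with target [1, -1] becomes [1, 4].
  for (int d : InferReshape({1, -1}, 4)) std::cout << d << ' ';
  std::cout << '\n';
}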
+)DOC"); + } +}; + +class ReshapeGradOp : public framework::OperatorWithKernel { + public: + ReshapeGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, + ops::ReshapeGradOp); +REGISTER_OP_CPU_KERNEL(reshape, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL( + reshape_grad, ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9ae6da29e54187b2d6aedb833a2aa4ca95cacba --- /dev/null +++ b/paddle/fluid/operators/reshape_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/reshape_op.h" + +REGISTER_OP_CUDA_KERNEL( + reshape, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + reshape_grad, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a17ba7c619490977b837c565ef1f4cc0780d5c61 --- /dev/null +++ b/paddle/fluid/operators/reshape_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ReshapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto out_dims = out->dims();
+    out->mutable_data<T>(ctx.GetPlace());
+    framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out);
+    out->Resize(out_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReshapeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_x->mutable_data<T>(ctx.GetPlace());
+
+    auto in_dims = d_x->dims();
+    framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+    d_x->Resize(in_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..06d3ccafefd4cc163b806aeb5d2a582c686f10cb
--- /dev/null
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace paddle { +namespace operators { + +class RmspropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), + "Input(MeanSquare) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of RmspropOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(param_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(Momentum_out) of RmspropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), + "Output(MeanSquareOut) of RmspropOp should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and grad input of RmspropOp should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), + "Param and Momentum input of RmspropOp " + "should have the same dimension."); + + auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, + "Learning Rate should be a scalar."); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + ctx->SetOutputDim("MeanSquareOut", param_dim); + } +}; + +class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter value that has to be updated."); + AddInput("MeanSquare", + "(Tensor, default Tensor)" + " The mean square value that gets updated."); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "The learning rate should be a tensor of size 1."); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter."); + AddInput("Moment", + "(Tensor, default Tensor) The moment that gets updated."); + + AddOutput("ParamOut", "(Tensor) Output updated parameter value."); + AddOutput("MomentOut", "(Tensor) Output updated moment."); + AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); + + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddAttr("decay", + "(float, default 0.9) " + "Discounting factor for coming gradient.") + .SetDefault(0.9f); + AddAttr("momentum", "(float, default 0.0) Constant value.") + .SetDefault(0.0f); + AddComment(R"DOC( +Rmsprop Optimizer. 
+ +$$ +MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ +MomentOut = momentum * Moment + + \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\ +ParamOut = Param - MomentOut +$$ + +The original slides that proposed Rmsprop: Slide 29 of +http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.cu b/paddle/fluid/operators/rmsprop_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a909c942791d2e2e4d9887d4c9265383a93ca137 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/rmsprop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h new file mode 100644 index 0000000000000000000000000000000000000000..469c102a4721ca45026112e3166dc0807ba93292 --- /dev/null +++ b/paddle/fluid/operators/rmsprop_op.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class RmspropOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); + auto* mean_square_out = ctx.Output("MeanSquareOut"); + + auto grad = ctx.Input("Grad"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + mean_square_out->mutable_data(ctx.GetPlace()); + + float epsilon = ctx.Attr("epsilon"); + float rho = ctx.Attr("decay"); + float momentum = ctx.Attr("momentum"); + + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); + auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); + auto g = EigenVector::Flatten(*grad); + auto mom = EigenVector::Flatten(*ctx.Input("Moment")); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + auto& place = *ctx.template device_context().eigen_device(); + + Eigen::DSizes grad_dsize(grad->numel()); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + mom_out.device(place) = + momentum * mom + + lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + p_out.device(place) = p - mom_out; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..504456c4b069f81319893ae51f57503f5025761a --- /dev/null +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
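A minimal, self-contained sketch of the RMSProp update equations above applied to plain scalars (illustrative only; the real kernel applies the same update element-wise with Eigen, and the hyper-parameter values here are made up):

#include <cmath>
#include <iostream>

struct RmspropState {
  float param, mean_square, moment;
};

void RmspropStep(RmspropState& s, float grad, float lr, float decay,
                 float momentum, float epsilon) {
  s.mean_square = decay * s.mean_square + (1.0f - decay) * grad * grad;
  s.moment = momentum * s.moment +
             lr * grad / std::sqrt(s.mean_square + epsilon);
  s.param -= s.moment;
}

int main() {
  RmspropState s{1.0f, 0.0f, 0.0f};
  for (int i = 0; i < 3; ++i) {
    RmspropStep(s, /*grad=*/0.5f, /*lr=*/0.1f, /*decay=*/0.9f,
                /*momentum=*/0.0f, /*epsilon=*/1e-10f);
    std::cout << "param after step " << i + 1 << ": " << s.param << '\n';
  }
}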
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class RNNMemoryHelperOp : public framework::OperatorBase { + public: + RNNMemoryHelperOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto mem_var_name = Input("X"); + auto *mem_var = scope.FindVar(mem_var_name); + PADDLE_ENFORCE(mem_var != nullptr, + "Cannot find mem_var in scope, mem_var_name is %s", + mem_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto *out_tensor = out_var->GetMutable(); + auto &mem_tensor = mem_var->Get(); + out_tensor->ShareDataWith(mem_tensor); + out_tensor->set_lod(mem_tensor.lod()); + } +}; + +class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Out"), ""); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddOutput("Out", ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOp : public framework::OperatorBase { + public: + RNNMemoryHelperGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto out_grad_var_name = Input(framework::GradVarName("Out")); + auto *out_grad_var = scope.FindVar(out_grad_var_name); + + auto in_grad_var_name = Output(framework::GradVarName("X")); + auto *in_grad_var = scope.FindVar(in_grad_var_name); + PADDLE_ENFORCE(in_grad_var != nullptr, + "Cannot find in_grad_var in scope, name is %s", + in_grad_var_name); + + if (out_grad_var == nullptr) { + VLOG(5) << "Using fill constant 0 as starting gradient"; + auto in_var_name = Input("X"); + auto *in_var = scope.FindVar(in_var_name); + auto &in_var_tensor = in_var->Get(); + + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(in_var_tensor.type()); + attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); + attrs["value"] = 0.0f; + + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); + zero_op->Run(scope, dev_place); + } else { + auto &out_grad_tensor = out_grad_var->Get(); + auto *in_grad_tensor = in_grad_var->GetMutable(); + in_grad_tensor->ShareDataWith(out_grad_tensor); + in_grad_tensor->set_lod(out_grad_tensor.lod()); + } + } +}; + +class RNNMemoryHelperGradOpInfoMaker + : public framework::OpProtoAndCheckerMaker { + public: + RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : 
OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(framework::GradVarName("Out"), ""); + AddInput("X", ""); + AddInput("Out", ""); + AddOutput(framework::GradVarName("X"), ""); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::DataType::FP32); + AddComment(""); + } +}; + +class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + auto x_grad_name = framework::GradVarName("X"); + PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(rnn_memory_helper, paddle::operators::RNNMemoryHelperOp, + paddle::operators::RNNMemoryHelperOpInfoMaker, + paddle::operators::RNNMemoryHelperOpShapeInference, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(rnn_memory_helper_grad, + paddle::operators::RNNMemoryHelperGradOp, + paddle::operators::RNNMemoryHelperGradOpInfoMaker, + paddle::operators::RNNMemoryHelperGradOpShapeInference); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..09238f89a775979b8b1866d410e6ad1ef772f9d7 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kROISize = 5; + +class ROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Argmax"), + "Output(Argmax) of ROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + PADDLE_ENFORCE(rois_dims[1] == kROISize, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Argmax", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor), " + "the input of ROIPoolOp. " + "The format of input tensor is NCHW. Where N is batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(Tensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]. 
" + "Where batch_id is the id of the data, " + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddOutput("Out", + "(Tensor), " + "The output of ROIPoolOp is a 4-D tensor with shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddOutput("Argmax", + "(Tensor), " + "Argmaxes corresponding to indices in X used " + "for gradient computation. Only output " + "if arg “is_test” is false.") + .AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddComment(R"DOC( +ROIPool operator + +ROI Pooling for Faster-RCNN. The link below is a further introduction: +https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, + ops::ROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_pool_grad, + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e8fc9ec7a68cffeb45f8ece3e5bde39d1e71e92 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/roi_pool_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 5; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPUROIPoolForward(const int nthreads, const T* input_data, + const int64_t* input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[index] = maxval; + if (argmax_data) { + argmax_data[index] = maxidx; + } + } +} + +template +__global__ void GPUROIPoolBackward( + const int nthreads, const int64_t* input_rois, const T* output_grad, + const int64_t* argmax_data, const int num_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd( + offset_input_grad + argmax, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +class GPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + auto in_stride = framework::stride(in_dims); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + size_t rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + GPUROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + channels, height, width, pooled_height, pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + size_t rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant 
set_zero; + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), + argmax->data(), rois_num, spatial_scale, channels, height, + width, pooled_height, pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + roi_pool_grad, + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15f3b36fcd16bf72b9b09f58a3019b24538eec12 --- /dev/null +++ b/paddle/fluid/operators/roi_pool_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + const int64_t* rois_data = rois->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + PADDLE_ENFORCE_GE(roi_batch_id, 0); + PADDLE_ENFORCE_LT(roi_batch_id, batch_size); + rois_data += roi_stride[0]; + } + + rois_data = rois->data(); + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + int roi_start_w = round(rois_data[1] * spatial_scale); + int roi_start_h = round(rois_data[2] * spatial_scale); + int roi_end_w = round(rois_data[3] * spatial_scale); + int roi_end_h = round(rois_data[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + + const float bin_size_h = + static_cast(roi_height) 
/ static_cast(pooled_height); + const float bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + argmax_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + // Increment ROI data pointer + rois_data += roi_stride[0]; + } + return; + } +}; + +template +class CPUROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + + if (in_grad) { + const int64_t* rois_data = rois->data(); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); + + auto in_stride = framework::stride(in->dims()); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (argmax_data[pool_index] >= 0) { + auto index = argmax_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + rois_data += roi_stride[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/row_conv_op.cc 
b/paddle/fluid/operators/row_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..92661ea9716a89a66c27fa21543d81b5a280bcdd --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. 
It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = 
+ EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..832072edf810099d142c82930abfd7f198a7d1b8 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.cu @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
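A self-contained sketch of the lookahead convolution computed by the CPU kernel above (illustrative only; a single feature dimension and made-up numbers): each output step is a weighted sum of the current input step and the next future_context - 1 steps, truncated at the end of the sequence.

#include <iostream>
#include <vector>

int main() {
  std::vector<float> in = {1, 2, 3, 4};  // T = 4 time steps, one feature
  std::vector<float> w = {0.5f, 0.25f};  // future_context = 2
  int T = static_cast<int>(in.size());
  int context = static_cast<int>(w.size());

  for (int k = 0; k < T; ++k) {
    float out = 0.0f;
    for (int j = 0; j < context && k + j < T; ++j) out += w[j] * in[k + j];
    std::cout << "out[" << k << "] = " << out << '\n';
  }
  // Expected: 1, 1.75, 2.5, 2 (the last step only sees itself).
}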
*/ + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/row_conv_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
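+          // Warp-level tree reduction: each __shfl_down halves the stride
+          // (16, 8, ..., 1), summing the 32 per-lane partial products so that
+          // lane 0 (thx == 0) ends up holding the warp total.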
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = 
X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.h b/paddle/fluid/operators/row_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..59164b5215910630b4641501bc0b0c0e941911c2 --- /dev/null +++ b/paddle/fluid/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c23de9073ef965b989e98936b2dd07fc6bce7fdc --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(sidgoyal78): These function are needed by other files (save_op), move +// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). 
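+// The static helpers below are minimal POSIX filesystem utilities: FileExists
+// stats the path, DirName strips the last path component, and MkDirRecursively
+// creates any missing parent directories (mode 0755) before the output file is
+// written.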
+constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveCombineOp : public framework::OperatorBase { + public: + SaveCombineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto inp_var_names = Inputs("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + auto *var = scope.FindVar(inp_var_names[i]); + + PADDLE_ENFORCE(var != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(var->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = var->Get(); + // Serialize tensor + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } +}; + +class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(vector) Input LoDTensors that need to be saved together in a file.") + .AsDuplicable(); + AddComment(R"DOC( +SaveCombine operator + +This operator will serialize and write a list of input LoDTensor variables +to a file on disk. 
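+
+The LoDTensors are serialized back to back, in the order given in Input(X), so
+the companion load_combine operator reads them back in that same order.
+
+A minimal C++ sketch, mirroring the accompanying save_load_combine_op_test
+(the variable names and file path are hypothetical, and an existing
+framework::Scope and platform::Place are assumed):
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("vars.combined")});
+  auto op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine", {{"X", {"var1", "var2"}}}, {}, attrs);
+  op->Run(scope, place);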
+)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if it exists.") + .SetDefault(true); + AddAttr( + "file_path", + "(string)" + "The \"file_path\" where the LoDTensor variables will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, + ops::SaveCombineOpProtoMaker); diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8325bac6bc59602e81d38cb857b7b8e133be2cc --- /dev/null +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save_combine); +USE_NO_KERNEL_OP(load_combine); + +int* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, + std::string var_name, + paddle::platform::CPUPlace& place, + paddle::framework::Scope& scope, + paddle::framework::LoD& expect_lod) { + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({x, y}); + expect_lod.resize(1); + for (size_t i = 0; i < lod_info.size(); i++) { + expect_lod[0].push_back(lod_info[i]); + } + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + return expect; +} + +paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad( + const std::string out_var_name, paddle::framework::Scope& scope) { + auto load_var = scope.Var(out_var_name); + auto target = load_var->GetMutable(); + return target; +} + +int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target, + paddle::framework::Scope& scope, + paddle::framework::LoD& actual_lod) { + int* actual = target->data(); + actual_lod = target->lod(); + return actual; +} + +void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod, + paddle::framework::LoD actual_lod, const int& numel) { + for (int64_t i = 0; i < numel; ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} + +// Here, we create 4 LoDTensors and use save_combine_op to first save these +// in a single file. 
Then, we use load_combine_op to load these sequentially +TEST(SaveLoadCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + std::vector lod1 = {0, 1, 2, 3, 10}; + int numel1 = 100; + paddle::framework::LoD expect_lod1; + int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope, + expect_lod1); + + std::vector lod2 = {0, 2, 5, 10}; + int numel2 = 200; + paddle::framework::LoD expect_lod2; + int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope, + expect_lod2); + + std::vector lod3 = {0, 2, 3, 20}; + int numel3 = 4000; + paddle::framework::LoD expect_lod3; + int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, + scope, expect_lod3); + + std::vector lod4 = {0, 1, 20}; + int numel4 = 1000; + paddle::framework::LoD expect_lod4; + int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope, + expect_lod4); + + // Set attributes + std::string filename = "check_tensor.ls"; + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string(filename)}); + + // Run the save_combine_op + auto save_combine_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", + {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs); + save_combine_op->Run(scope, place); + + // Set up output vars + auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope); + auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope); + auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope); + auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope); + + // Run the load_combine_op + auto load_combine_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, + {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs); + load_combine_op->Run(scope, place); + + paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; + int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1); + int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2); + int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3); + int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4); + + CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); + CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); + CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); + CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); +} + +// Test with original SaveLoadTest +TEST(SaveLoadTestWithCombineOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("check_t.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save_combine", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, 
place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..da4573a8ed936cf607123590ca41fb8f630930f3 --- /dev/null +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" + +USE_NO_KERNEL_OP(save); +USE_NO_KERNEL_OP(load); + +TEST(SaveLoadOp, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + expect_lod[0].push_back(0); + expect_lod[0].push_back(1); + expect_lod[0].push_back(2); + expect_lod[0].push_back(3); + + tensor->set_lod(expect_lod); + int* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + int* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], actual[i]); + } + auto& actual_lod = target->lod(); + EXPECT_EQ(expect_lod.size(), actual_lod.size()); + for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); + } + } +} diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..483cdfa4c3b9e3b9abd3f32bc5e6e5e0b493bd23 --- /dev/null +++ b/paddle/fluid/operators/save_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. +constexpr char kSEP = '/'; +static bool FileExists(const std::string &filepath) { + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + if (mkdir(path, 0755)) { + PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); + } +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} + +class SaveOp : public framework::OperatorBase { + public: + SaveOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto filename = Attr("file_path"); + auto overwrite = Attr("overwrite"); + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + PADDLE_ENFORCE(var->IsType(), + "SaveOp only support LoDTensor, %s has wrong type", iname); + + auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::SerializeToStream(fout, tensor, dev_ctx); + } +}; + +class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor ) Input tensor to be saved"); + AddComment(R"DOC( +Save operator + +This operator will serialize and write a tensor variable to file on disk. 
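+
+The variable is written with framework::SerializeToStream and can be read back
+from the same "file_path" by the load operator.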
+)DOC"); + AddAttr("overwrite", + "(boolean, default true)" + "Overwrite the output file if exist") + .SetDefault(true); + AddAttr("file_path", + "(string)" + "The \"file_path\" where the variable will be saved.") + .AddCustomChecker( + [](const std::string &path) { return !path.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..017fc2c00e4016052179acfe328cdda42d6f84de --- /dev/null +++ b/paddle/fluid/operators/scale_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { + +class ScaleOp : public framework::OperatorWithKernel { + public: + ScaleOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ScaleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScaleOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +Scale operator + +$$Out = scale*X$$ +)DOC"); + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") + .SetDefault(1.0); + } +}; + +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, + ops::ScaleGradMaker); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a9b46077aa07406fef2cba5b18d190501ce2f92a --- /dev/null +++ 
b/paddle/fluid/operators/scale_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/scale_op.h" + +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c2964ca6385dc6fb81f61a3e5bb042f5d7019f --- /dev/null +++ b/paddle/fluid/operators/scale_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class ScaleKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + tensor->mutable_data(in->place()); + + auto scale = static_cast(context.Attr("scale")); + + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& dev = + *context.template device_context().eigen_device(); + eigen_out.device(dev) = scale * eigen_in; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..0f1b9426a745ac293bd756da6ee750119879429e --- /dev/null +++ b/paddle/fluid/operators/scatter.cu.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ScatterCUDAKernel(const T* params, const int* indices, + T* output, size_t index_size, + size_t slice_size) { + CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + int scatter_i = indices[indices_i]; + int out_i = scatter_i * slice_size + slice_i; + *(output + out_i) = *(params + i); + } +} + +/** + * A thin wrapper on gpu tensor + * Return a new updated tensor from source tensor, scatter-assigned according to + * index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + // PADDLE_ENFORCE(platform::is_gpu_place(place)); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + int block = 512; + int n = slice_size * index_size; + int grid = (n + block - 1) / block; + + ScatterCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_src, p_index, p_output, index_size, slice_size); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h new file mode 100644 index 0000000000000000000000000000000000000000..70cae1286caf10323e8e424853f1dc14f84b110c --- /dev/null +++ b/paddle/fluid/operators/scatter.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +/** + * Return a updated tensor from source tensor, scattered according to index: + * dst[i] = src[index[i]] + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + // check index of shape 1-D + PADDLE_ENFORCE(index.dims().size() == 1); + int index_size = index.dims()[0]; + + auto src_dims = src.dims(); + auto dst_dims = output->dims(); + + const T* p_src = src.data(); + const int* p_index = index.data(); + T* p_output = output->data(); + + // check src shape and dst shape should match + for (int i = 1; i < src_dims.size(); i++) + PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + + // slice size + size_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e35930af53463e18e5ecca3cf41b91ed58a7c4c2 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class ScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ref"), + "Input(Ref) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Index"), + "Input(Index) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Updates"), + "Input(Updates) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ScatterOp should not be null."); + + auto updates_dims = ctx->GetInputDim("Updates"); + auto ref_dims = ctx->GetInputDim("Ref"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1, + "Update Index should be 1-D."); + PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(), + "Reference and Updates should have the same shape size"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], + ctx->GetInputDim("Index")[0], + "Updates and Index should have same batch-size."); + framework::DDim data_dim(updates_dims); + for (int i = 1; i < data_dim.size(); ++i) { + PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]); + } + ctx->SetOutputDim("Out", ref_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Ref")->type()), + ctx.device_context()); + } +}; + +class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ref", "The source input of scatter op"); + AddInput("Index", + "The index input of scatter op where Ref will be updated"); + AddInput("Updates", "The updated value of updates op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Scatter Operator. + +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ +Out[Index] = Ref[Index] + Updates +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, + ops::ScatterGradOp); +REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); +REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..f9eaae33a802ed1a45184a24757e3883fad5e639 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.cu @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gather.cu.h" +#include "paddle/fluid/operators/gather_op.h" +#include "scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class ScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + Out->ShareDataWith(*Ref); + + GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates = dO[Index] + GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65d10546328780e09bb57876acf2326d98803847 --- /dev/null +++ b/paddle/fluid/operators/scatter_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "gather.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *Ref = ctx.Input("Ref"); + auto *Index = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + // In place output: Out = Ref, Out[Index] += Updates + Out->ShareDataWith(*Ref); + // Apply ScatterUpdate: Out[index] += Updates[:] + ScatterAssign(ctx.device_context(), *Updates, *Index, Out); + } +}; + +template +class ScatterGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Index = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + // In place gradient: dRef = dO + dRef->ShareDataWith(*dOut); + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates += dO[Index] + CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb5ef96af34e5bd2dc0802ea76456a8b47749ab --- /dev/null +++ b/paddle/fluid/operators/scatter_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include + +TEST(scatter, ScatterUpdate) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + float* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({1, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({1}), CPUPlace()); + + for (size_t i = 0; i < 4; ++i) p_src[i] = float(i); + p_index[0] = 1; + + float* p_output = output->mutable_data(make_ddim({4, 4}), CPUPlace()); + + auto* cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext ctx(*cpu_place); + ScatterAssign(ctx, *src, *index, output); + + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data()[i], float(0)); + for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4)); + for (size_t i = 4; i < 8; ++i) + EXPECT_EQ(output->data()[i], float(i - 4)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data()[i], float(0)); + + delete src; + delete index; + delete output; +} diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8390aa6596c69f85e3ef736dda9dd99c3fd6dba --- /dev/null +++ b/paddle/fluid/operators/send_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + auto outs = Outputs("Out"); + std::vector epmap = Attr>("epmap"); + std::vector endpoints = + Attr>("endpoints"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + for (size_t i = 0; i < ins.size(); i++) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + for (auto& ep : endpoints) { + VLOG(3) << "batch barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); + } + PADDLE_ENFORCE(rpc_client->Wait()); + + if (outs.size() > 0) { + for (size_t i = 0; i < outs.size(); i++) { + VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); + } + PADDLE_ENFORCE(rpc_client->Wait()); + } + } +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); + AddOutput("Out", "(Tensor) Output tensor to be received from server") + .AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); + AddComment(R"DOC( +Send operator + +This operator will send tensor to recv_op at the parameter server. +)DOC"); + // TODO(typhoonzero): remove this attr generate de-duplicated vector from + // epmap when initializing. + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({}); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..716f687044a85d46676141ee125baf398e9e695d --- /dev/null +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/string/printf.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(listen_and_serv); +USE_OP(sum); + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +// global for simplicity. +std::unique_ptr listen_and_serv_op; + +void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + } + + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate +} + +void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { + p::CPUDeviceContext ctx(place); + int64_t height = 10; + int64_t row_numel = 10; + m::SetConstant set_one; + // init x0 + std::vector rows0{0, 4, 7}; + auto x0_var = scope.Var("x0"); + auto x0 = x0_var->GetMutable(); + x0->set_rows(rows0); + x0->set_height(height); + auto x0_value = x0->mutable_value(); + x0_value->mutable_data( + f::make_ddim({static_cast(rows0.size()), row_numel}), place); + set_one(ctx, x0_value, 1.0); + + // init x1 + std::vector rows1{2, 9}; + auto x1_var = scope.Var("x1"); + auto x1 = x1_var->GetMutable(); + x1->set_rows(rows1); + x1->set_height(height); + auto x1_value = x1->mutable_value(); + x1_value->mutable_data( + f::make_ddim({static_cast(rows1.size()), row_numel}), place); + set_one(ctx, x1_value, 1.0); + + auto out_var = scope.Var("Out"); + auto out = out_var->GetMutable(); + auto out_value = out->mutable_value(); + out->set_height(height); + out_value->mutable_data(f::make_ddim({5, 10}), place); +} + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + f::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(f::proto::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +void StartServerNet(bool is_sparse) { + f::Scope scope; + p::CPUPlace place; + if (is_sparse) { + InitSelectedRowsInScope(scope, place); + } else { + InitTensorsInScope(scope, place); + } + + // sub program run in listen_and_serv_op, for simple test we use sum + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + // X for server side tensors, RX for received 
tensers, must be of same shape. + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); + + f::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"ParamList", std::vector({"Out"})}); + attrs.insert({"GradList", std::vector({"x1"})}); + attrs.insert({"OptimizeBlock", block}); + listen_and_serv_op = + f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs); + listen_and_serv_op->Run(scope, place); +} + +TEST(SendRecvOp, CPUDense) { + std::thread server_thread(StartServerNet, false); + sleep(10); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + InitTensorsInScope(scope, place); + + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto in_var = scope.Var("x1"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // x1 * 2 == x0 + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(nullptr); +} + +TEST(SendRecvOp, CPUSparse) { + std::thread server_thread(StartServerNet, true); + sleep(3); // wait server to start + // local net + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + InitSelectedRowsInScope(scope, place); + f::AttributeMap attrs; + attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); + attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); + auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}}, attrs); + send_op->Run(scope, place); + + auto x0 = scope.Var("x0")->GetMutable(); + auto x1 = scope.Var("x1")->GetMutable(); + auto out = scope.Var("Out")->GetMutable(); + auto actual = out->mutable_value(); + + std::unique_ptr expect{new f::SelectedRows()}; + auto expect_value = expect->mutable_value(); + expect_value->mutable_data(f::make_ddim({5, 10}), place); + + m::SelectedRowsAdd add_functor; + add_functor(ctx, *x0, *x1, expect.get()); + + EXPECT_EQ(actual->numel(), expect_value->numel()); + EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size()); + + for (int64_t i = 0; i < expect_value->numel(); ++i) { + EXPECT_EQ(expect_value->mutable_data(place)[i], + actual->mutable_data(place)[i]); + } + listen_and_serv_op->Stop(); + server_thread.join(); + listen_and_serv_op.reset(); +} diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ddf800d85e11aa631255c1b1ec5c12f6e0f221c --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace paddle { +namespace operators { + +class SequenceConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), + "Inputs(X) of SequenceConcatOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConcatOp should not be null."); + const size_t level = static_cast(ctx->Attrs().Get("level")); + const size_t axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE(level == 0UL || level == 1UL, + "The sequence_concat operator only accepts sequence " + "or a nested sequence as its input."); + auto ins_dims = ctx->GetInputsDim("X"); + framework::DDim out_dims = ins_dims[0]; + const size_t n = ins_dims.size(); + for (size_t i = 1; i < n; ++i) { + out_dims[axis] += ins_dims[i][axis]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LodTensorArray) Input is a vector of LoDTensor, " + "each of which is a variable-length sequence or nested sequence.") + .AsDuplicable(); + AddOutput("Out", + "(LoDTensor), Variable-length output of " + "sequence_concat Op."); + AddAttr("axis", + "(int, default 0) " + "The axis along which the inputs will be joined. " + "If axis is 0, the inputs will be joined with LoD index.") + .SetDefault(0); + AddAttr("level", + "(int, default 0) " + "The level at which the inputs will be joined. " + "If the level is 0, the inputs will be joined at the nested " + "sequence level. " + "If the level is 1, the inputs will be joined at the " + "sequence level. " + "The level should be less than the level number of inputs.") + .SetDefault(0); + AddComment(R"DOC( +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) +or a nested sequence (LoD tensor with level number is 2) as its input. +- Case1: + If the axis is other than 0(here, axis is 1 and level is 1), + each input should have the same LoD information and the LoD + information of the output keeps the same as the input. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) + LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) + +- Case2: + If the axis is 0(here, leve is 0), the inputs are concatenated along + time steps, the LoD information of the output need to re-compute. + The LoD information of level-1 should be same. + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4) + +- Case3: + If the axis is 0(here, level is 1). + + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4) + +- Case4: + If the LoD number is 1, axis is 0, level is 0 + + LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4) + +NOTE: The levels of all the inputs should be the same. 
+ )DOC"); + } +}; + +class SequenceConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); +REGISTER_OP_CPU_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_concat_grad, + ops::SequenceConcatGradOpKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5a280ef9e2515114f5dd6826e55a304066973aa --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_concat_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_concat, + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9121196369f1bee20abc56a33b9da8bc4a43f315 --- /dev/null +++ b/paddle/fluid/operators/sequence_concat_op.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +LoD ConcatLoD(const std::vector ins, const size_t level) { + auto out_lod = ins[0]->lod(); + auto numLevels = ins[0]->NumLevels(); + const size_t n = ins.size(); + const size_t level_idx = ins[0]->NumLevels() - 1 - level; + for (size_t i = 1; i < n; ++i) { + for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) { + out_lod[level_idx][j] += ins[i]->lod()[level_idx][j]; + } + } + + for (size_t i = level_idx; i < numLevels - 1; ++i) { + size_t lod_len = 1; + for (size_t j = 0; j < n; ++j) { + lod_len += ins[j]->lod()[i + 1].size() - 1; + } + out_lod[i + 1].clear(); + out_lod[i + 1].resize(lod_len); + + size_t idx = 1; + for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) { + for (size_t k = 0; k < n; ++k) { + for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) { + out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] + + ins[k]->lod()[i + 1][m + 1] - + ins[k]->lod()[i + 1][m]; + idx++; + } + } + } + } + + return out_lod; +} + +template +class SequenceConcatOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + const size_t axis = static_cast(ctx.Attr("axis")); + const size_t level = static_cast(ctx.Attr("level")); + const size_t n = ins.size(); + + for (size_t i = 1; i < n; ++i) { + PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(), + "The levels of all the input LoDTensors " + "should be the same."); + PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(), + "The dimension size of all the input LoDTensors " + "should be the same."); + + const size_t dims_size = ins[i]->dims().size(); + for (size_t j = 0; j < dims_size; ++j) { + if (j == axis) continue; + PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j], + "Except for the dimension of the specified " + "axis along which all the inputs are concatenated, " + "dimensions of all the other axises of the input " + "LoDTensors should be the same."); + } + } + PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level, + "The levels of all the input LoDTensors " + "should be greater than the specify level"); + + out->mutable_data(ctx.GetPlace()); + auto out_lod = ins[0]->lod(); + if (axis == 0) { + out_lod = ConcatLoD(ins, level); + } + out->set_lod(out_lod); + + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_t = out->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_stride = framework::stride(out_t.dims()); + size_t offset = 0; + for (size_t j = 0; j < n; ++j) { + auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx]; + auto in_stride = framework::stride(ins[j]->dims()); + Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), + static_cast(in_lod_level[i + 1])); + size_t axis_dim = in_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out_t.data() + offset); + offset += axis_dim * in_stride[axis]; + } + } + } +}; + +template +class SequenceConcatGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) 
const override { + auto ins = ctx.MultiInput("X"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto x_grads = + ctx.MultiOutput(framework::GradVarName("X")); + size_t axis = static_cast(ctx.Attr("axis")); + size_t level = static_cast(ctx.Attr("level")); + const size_t n = x_grads.size(); + + // Set Grad(X) LoD as X + for (size_t i = 0; i < n; i++) { + x_grads[i]->set_lod(ins[i]->lod()); + x_grads[i]->mutable_data(ctx.GetPlace()); + } + auto out_lod = ins[0]->lod(); + if (axis == 0UL) { + out_lod = ConcatLoD(ins, level); + } + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; + + for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + size_t offset = 0; + + for (size_t j = 0; j < n; ++j) { + auto x_grad_lod_level = + framework::ToAbsOffset(x_grads[j]->lod())[level_idx]; + auto x_grad_stride = framework::stride(x_grads[j]->dims()); + Tensor x_grad_t = + x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), + static_cast(x_grad_lod_level[i + 1])); + size_t axis_dim = x_grad_t.dims()[axis]; + StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + offset += axis_dim * out_grad_stride[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..af9938b18069d65648fbbf0deae31eff088b791f --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace paddle { +namespace operators { + +class SequenceConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of SequenceConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceConvOp should not be null."); + + int context_length = ctx->Attrs().Get("contextLength"); + int context_start = ctx->Attrs().Get("contextStart"); + + auto in_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE(ctx->Attrs().Get("contextStride") == 1, + "Currently, SequenceConvOp only supports contextStride=1."); + PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2, + "Input(X, Filter) should be 2-D tensor."); + PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1], + "Filter's height should be context_length * " + "input_hidden_size ."); + + if (ctx->Attrs().Get("paddingTrainable")) { + PADDLE_ENFORCE( + ctx->HasInput("PaddingData"), + "Input(PaddingData) of SequenceConvOp should not be null."); + framework::DDim padding_dim = ctx->GetInputDim("PaddingData"); + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int total_pad = up_pad + down_pad; + int input_width = static_cast(in_dims[1]); + + if (context_start == 0 && context_length == 1) { + PADDLE_THROW( + "If context_start is 0 and context_length is 1, paddingTrainable " + "should be false."); + } + PADDLE_ENFORCE(padding_dim.size() == 2, + "Input(PaddingData) should be 2-D tensor."); + PADDLE_ENFORCE( + padding_dim[0] == total_pad && padding_dim[1] == input_width, + "Input(PaddingData)'s shape is not consistent with 'context_start' " + "and 'context_length'."); + } + + in_dims[1] = filter_dims[1]; + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null."); + + if (ctx->Attrs().Get("paddingTrainable") && + ctx->HasOutput(framework::GradVarName("PaddingData"))) { + ctx->SetOutputDim(framework::GradVarName("PaddingData"), + ctx->GetInputDim("PaddingData")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), + ctx->GetInputDim("Filter")); + } + } +}; + +class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(LoDTensor) the input(X) is a LodTensor, which supports " + "variable-time length input sequence. 
The underlying tensor in " + "this LoDTensor is a matrix with shape (T, N), where T is the " + "total time steps in this mini-batch and N is the input_hidden_size."); + AddInput("PaddingData", + "(Tensor, optional) the input(PaddingData) is an optional " + "parameter, and it is learnable. " + "This is a tensor with shape (P, N), where P is the " + "top_pad + bottom_pad, N is the input_hidden_size. In order to " + "ensure the equal length of sequence before and after " + "convolution, it is necessary to fill the top and bottom of each " + "sequence according to context_length, context_stride and " + "context_start") + .AsDispensable(); + AddInput( + "Filter", + "(Tensor) the input(Filter) is an learnable parameter." + "This is a tensor with shape (K, M), where K is the " + "context_length * input_hidden_size, M is the output feature size."); + AddOutput( + "Out", + "(LoDTensor) the output(Out) is a LodTensor, which support " + "variable-time length output sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T, M), where, T is the " + "total time steps in this mini-batch, M is the output feature size."); + + AddAttr("paddingTrainable", + "(bool, default:false) the padding data of SequenceConvOp " + "is trainable or not.") + .SetDefault(false); + AddAttr("contextLength", + "(int) the contextLength of SequenceConvOp is the " + "height of the convolution kernel.") + .GreaterThan(0); + AddAttr("contextStart", + "(int, default:0) the contextStart of SequenceConvOp " + "represents the beginning of the convolution of the number of " + "rows of sequence, which can be negative. The negative number " + "means to pad contextStart time-steps of zeros or learnable " + "parameters at the beginning of each instance. The positive " + "number means to skip contextStart time-steps of each " + "instance.") + .SetDefault(0); + AddAttr("contextStride", + "(int, default:1) the contextStride of SequenceConvOp " + "represents the stride length of convolution kernel. " + "Currently, SequenceConvOp only supports" + "contextStride=1.") + .SetDefault(1) + .GreaterThan(0); + + AddComment(R"DOC( +Sequence Conv Operator. + +SequenceConvOp performs convolution operation on features of contextLength +time-steps of each instance. The convolution operation calculates the output +based on the input, filter, strides and paddings parameters. +The size of each dimension of the parameters is checked during infer-shape. +In order to ensure the equal length of sequence before and after convolution, +it is necessary to fill the top and bottom of each sequence based on +context_length, context_stride and context_start. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, + sequence_conv_grad, ops::SequenceConvGradOp); + +REGISTER_OP_CPU_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CPU_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_conv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..36f9e8da95d8c963c74fb6c8e75c777b7ba03095 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_conv_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c81067fea2370458cf6abe8e5465b4c674fbf09 --- /dev/null +++ b/paddle/fluid/operators/sequence_conv_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/context_project.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto filter = *context.Input("Filter"); + + out->mutable_data(context.GetPlace()); + context.ShareLoD("X", "Out"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + framework::DDim col_shape = {in->dims()[0], + context_length * sequence_width}; + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
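+    // `col` is an im2col-style buffer of shape {T, context_length * sequence_width};
+    // each row holds the unrolled context window of one time step, so the
+    // sequence convolution below reduces to a single GEMM: Out = col * Filter.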
+ math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); + + math::ContextProjectFunctor seq_project_functor; + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); + } +}; + +template +class SequenceConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in_g = context.Output(framework::GradVarName("X")); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* filter_g = context.Output(framework::GradVarName("Filter")); + auto* padding_data_g = + context.Output(framework::GradVarName("PaddingData")); + auto* in = context.Input("X"); + auto* filter = context.Input("Filter"); + + int context_start = context.Attr("contextStart"); + int context_length = context.Attr("contextLength"); + int context_stride = context.Attr("contextStride"); + bool padding_trainable = context.Attr("paddingTrainable"); + + PADDLE_ENFORCE_EQ(in->lod().size(), 1UL, + "Only support one level sequence now."); + auto lod_g_level_0 = in->lod()[0]; + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + int sequence_width = static_cast(in->dims()[1]); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + // use col_shape in the im2col calculation + framework::DDim col_shape = {in->dims()[0], + sequence_width * context_length}; + Tensor col; + + if (in_g || filter_g || (padding_trainable && padding_data_g)) { + col.mutable_data(col_shape, context.GetPlace()); + // Because if padding_trainable is false, padding data should be zeros. 
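+    // The matmul below reconstructs the gradient of the unrolled context
+    // windows: col = Out@GRAD * Filter^T. It is then scattered back into
+    // X@GRAD and PaddingData@GRAD; Filter@GRAD is computed afterwards from
+    // the forward projection of X multiplied with Out@GRAD.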
+ set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); + } + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; + + if (in_g) { + in_g->mutable_data(context.GetPlace()); + in_g->set_lod(in->lod()); + set_zero(dev_ctx, in_g, static_cast(0)); + + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); + } + + if (padding_trainable && padding_data_g) { + padding_data_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, padding_data_g, static_cast(0)); + + LoDTensor* input = const_cast(in); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); + } + + if (filter_g) { + filter_g->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_g, static_cast(0)); + + Tensor filter_grad = *filter_g; + LoDTensor out_grad = *out_g; + + const Tensor* padding_data = nullptr; + if (padding_trainable) { + padding_data = context.Input("PaddingData"); + } + + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); + + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0adf8b1900f7b7c43001459a7e7c494d854274 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_erase_op.h" + +namespace paddle { +namespace operators { + +class SequenceEraseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceEraseOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceEraseOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, + "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " + "with the 2nd dimension equal to 1."); + ctx->SetOutputDim("Out", x_dims); + } +}; + +class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Input LoDTensor of SequenceEraseOp."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dim. 
equal to 1) " + "Output LoDTensor of SequenceEraseOp."); + AddAttr>("tokens", + "(vector) Tokens need to be erased from " + "input sequences."); + AddComment(R"DOC( +Sequence Erase Operator. + +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD +information at the same time. For example, given a 2-D LoDTensor + + X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T + +with lod = [[0, 3, 6, 10]], there are three sequences in the input: + + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. + +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +operation, the three sequences become + + X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. + +Hence the LoDTensor Output(Out) should be + + Out = [[6, 1, 9, 6, 1, 0, 1]]^T, + +with lod = [[0, 1, 3, 7]]. + +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, +and end token. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, + ops::SequenceEraseOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_erase, + ops::SequenceEraseKernel, + ops::SequenceEraseKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..43fc352fe78d03fb54dd90a43e3d37b0646cefce --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/operators/sequence_erase_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len, + const int* tokens, const size_t tokens_len, + size_t* num_erased) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + for (size_t i = 0; i < tokens_len; ++i) { + if (in_dat[index] == tokens[i]) { + num_erased[index + 1] = 1; + break; + } + } + } +} + +__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod, + const size_t lod_len, size_t* out_lod0) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < lod_len) { + out_lod0[index] = in_lod[index] - num_erased[in_lod[index]]; + } +} + +template +__global__ void SetOutput(const T* in_dat, const int64_t in_len, + const size_t* num_erased, T* out_dat) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + if (num_erased[index] == num_erased[index + 1]) { + out_dat[index - num_erased[index]] = in_dat[index]; + } + } +} + +template +class SequenceEraseOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + // Copy tokens to GPU + thrust::device_vector dev_tokens(tokens.begin(), tokens.end()); + int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data()); + + // Count number of elements to be erased + thrust::device_vector num_erased(in_len + 1, 0); + size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); + auto stream = ctx.cuda_device_context().stream(); + LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr); + thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), + num_erased.begin() + 1); + + // Copy LoD to GPU + auto lod0 = lod[0]; + auto lod_len = lod0.size(); + const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); + + // Calc output LoD + thrust::device_vector dev_out_lod(lod_len); + size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); + GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); + // Set LoD for output + std::vector out_lod0(dev_out_lod.begin(), dev_out_lod.end()); + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + + // Set output + out->Resize({static_cast(out_lod0.back()), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + num_erased_ptr, out_dat); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(sequence_erase, + paddle::operators::SequenceEraseOpCUDAKernel, + paddle::operators::SequenceEraseOpCUDAKernel); diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h 
new file mode 100644 index 0000000000000000000000000000000000000000..e151279c7fc20d5e04048080a9432cb723334b75 --- /dev/null +++ b/paddle/fluid/operators/sequence_erase_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SequenceEraseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + std::vector num_erased(in_len + 1, 0); + std::vector out_lod0(1, 0); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + size_t num_out = 0; + for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + num_erased[j] = num_erased[j - 1]; + if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != + tokens.end()) { + num_erased[j] += 1; + } else { + num_out += 1; + } + } + out_lod0.push_back(out_lod0.back() + num_out); + } + + auto out_len = in_len - num_erased[in_len]; + out->Resize({static_cast(out_len), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + + for (int64_t i = 0; i < in_len; ++i) { + if (num_erased[i] == num_erased[i + 1]) { + out_dat[i - num_erased[i]] = in_dat[i]; + } + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ebce641d2876a4f2329b6d7d7263a6b2a31fcf6 --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SequenceExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasOutput("Out")); + PADDLE_ENFORCE(ctx->HasInput("Y")); + framework::DDim out_dim; + out_dim = ctx->GetInputDim("Y"); + ctx->ShareLoD("Y", "Out"); + ctx->SetOutputDim("Out", out_dim); + } +}; + +class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor or LoDTensor) The input(X) of this operator can be a " + "LoDTensor or a base Tensor."); + AddInput("Y", + "(LoDTensor)The reference input(Y) of sequence_expand op." + "It must be a LoDTensor with k-level(k>0)." + "The input(X) will be expanded according to LOD of input(Y)." + "The element numbers of last level in input(Y) " + "must be equal to dims[0] of input(X)."); + AddOutput("Out", + "(LodTensor)The output of sequence_expand op." + "The lod of output will be as same as input(Y)'s lod."); + AddComment(R"DOC( +Sequence Expand Operator. + +This operator expands input(X) according to LOD of input(Y). +Following are cases to better explain how this works: +Case 1: + +Given a 2-level LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 7, 8]] + Out.data = [a, a, a, b, b, b, c, d] + Out.dims = [8, 1] + +Case 2: + +Given a common Tensor input(X) + X.data = [a, b, c] + X.dims = [3, 1] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [a, a, b, c, c, c] + Out.dims = [6, 1] + +Case 3: + +Given a common Tensor input(X) + X.data = [[a, b], [c, d], [e, f]] + X.dims = [3, 2] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 1-level LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]] + Out.dims = [6, 2] + +Case 4: + +Given 2-level a LoDTensor input(X) + X.lod = [[0, 2, 3], + [0, 1, 3, 4]] + X.data = [a, b, c, d] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] +with condition len(Y.lod[-1]) -1 == X.dims[0] +then we get 2-level LoDTensor + Out.lod = [[0, 2, 4], + [0, 3, 6, 6, 8]] + Out.data = [a, a, a, b, b, b, d, d] + Out.dims = [8, 1] + + +)DOC"); + } +}; + +class SequenceExpandOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Out")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; 
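+// A minimal sketch of driving this operator directly through the op registry,
+// mirroring the pattern used in send_recv_op_test.cc. The variable names
+// ("x", "y", "out") and the empty attribute map are illustrative assumptions,
+// not part of this file:
+//
+//   namespace f = paddle::framework;
+//   f::Scope scope;
+//   paddle::platform::CPUPlace place;
+//   // "X" is the tensor to expand, "Y" supplies the target LoD
+//   // (see SequenceExpandOpMaker above).
+//   auto op = f::OpRegistry::CreateOp("sequence_expand",
+//                                     {{"X", {"x"}}, {"Y", {"y"}}},
+//                                     {{"Out", {"out"}}}, f::AttributeMap{});
+//   op->Run(scope, place);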
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, + sequence_expand_grad, ops::SequenceExpandOpGrad); +REGISTER_OP_CPU_KERNEL( + sequence_expand, + ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ac76d83da618680502d0add51ae68ac117ad2aa --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sequence_expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_expand, + ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL( + sequence_expand_grad, + ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8010627ff6f5acbf300b0f3f9281e60b4ebfa94e --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +class SequenceExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const T* x_data = x->data(); + auto x_dims = x->dims(); + auto* y = context.Input("Y"); + PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); + PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), + y->lod().back().size() - 1, + "The size of last lod level in Input(Y)" + "must be equal to dims[0] of Input(X)."); + out->set_lod(y->lod()); + auto* place = + context.template device_context().eigen_device(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + out_t.device(*place) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } + } +}; + +/* + *Given Grad(Out) + * + * Grad(Out).lod = [[0, 2], + * [0, 3, 6]] + * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + * Then + * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] + * = [0.6, 1.5] + * Grad(X).lod = Input(X).lod + * + * */ +template +class SequenceExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_last_level = out->lod().back(); + d_x->set_lod(x->lod()); + const T* d_out_data = d_out->data(); + T* d_x_data = d_x->mutable_data(context.GetPlace()); + size_t element_len = d_out->numel() / d_out->dims()[0]; + for (size_t i = 0; i < out_last_level.size() - 1; ++i) { + size_t repeat = out_last_level[i + 1] - out_last_level[i]; + Eigen::TensorMap< + Eigen::Tensor> + d_out_t(d_out_data, static_cast(repeat), element_len); + Eigen::TensorMap> + d_x_t(d_x_data, static_cast(element_len)); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); + d_out_data += (repeat * element_len); + d_x_data += element_len; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cfb336b2e0b31ab20182a36d806506b6af4c139 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace paddle { +namespace operators { + +class SequencePoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePoolOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("pooltype") == "MAX") { + PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"), + "Output(MaxIndex) of SequencePoolOp should not be null."); + ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X")); + } + } +}; + +class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); + AddOutput("Out", + "(Tensor) The output of SequencePoolOp does not contain LoD " + "infomation."); + AddOutput("MaxIndex", + "(Tensor) This tensor is used for the sequence max-pooling " + "to record the max indexes.") + .AsIntermediate(); + AddAttr( + "pooltype", + "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") + .SetDefault("AVERAGE") + .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); + AddComment(R"DOC( +Sequence Pool Operator. + +The SequencePoolOp pools features of all time-steps of each instance. +It supports six pooling types: +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +4. LAST: Out[i] = last instance in i-th sequence X[i] +5. FIRST: Out[i] = first instance in i-th sequence X[i] +6. MAX: $$Out[i] = max(X_i)$$ + +The following example explains how this works: +For a mini-batch of 3 variable-length sentences, +containing 2, 3, and 2 time-steps: + +Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. +Besides, for the sake of simplicity, we assume M=1 and N=1, +and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. + +Thus, Out is a [3,1,1] Tensor without LoD infomation. 
+And for different pooltype, the value of Out is as follows: + +- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 +- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 +- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), + 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) +- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) +- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) +- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) + + )DOC"); + } +}; + +class SequencePoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + auto og_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), + "The rank of output grad must equal to Input(X)."); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_pool_grad"); + op_desc_ptr->SetInput("X", Input("X")); + if (boost::get(GetAttr("pooltype")) == "MAX") { + op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex")); + } + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, + ops::SequencePoolGradOpMaker); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CPU_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.cu b/paddle/fluid/operators/sequence_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..364769c39bd1b94935630eb8c16c0e27787139e1 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/sequence_pool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7b67e6201ebb04b3fbda3520347c580fd9501098 --- /dev/null +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SequencePoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod(); + int64_t w = in->numel() / dims[0]; + + // InferShape by lod + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_GE( + dims[0], + /*batch size = */ static_cast(lod[0].size() - 1), + "The first dimension of Input(X) must be large than batch size."); + dims[0] = lod[0].size() - 1; + out->Resize({dims}); + + auto lod_level_0 = lod[0]; + + out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (pooltype == "MAX") { + math::MaxSeqPoolFunctor max_pool; + auto* index = context.Output("MaxIndex"); + index->Resize({dims}); + index->mutable_data(context.GetPlace()); + max_pool(dev_ctx, *in, out, index); + return; + } + + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { + Tensor in_t = in->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + Tensor out_t = out->Slice(i, i + 1); + int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); + auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); + auto out_e = EigenVector::Flatten(out_t); + + if (pooltype == "AVERAGE") { + out_e.device(place) = in_e.mean(Eigen::array({{0}})); + } else if (pooltype == "SUM") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})); + } else if (pooltype == "SQRT") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})) / + std::sqrt(static_cast(h)); + } else if (pooltype == "LAST") { + out_e.device(place) = 
in_e.chip(h - 1, 0); + } else if (pooltype == "FIRST") { + out_e.device(place) = in_e.chip(0, 0); + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +template +class SequencePoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* in_g = context.Output(framework::GradVarName("X")); + std::string pooltype = context.Attr("pooltype"); + + auto dims = in->dims(); + auto lod = in->lod()[0]; + int64_t w = in->numel() / dims[0]; + + in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + if (pooltype == "MAX") { + math::MaxSeqPoolGradFunctor max_pool_grad; + auto* index = context.Input("MaxIndex"); + max_pool_grad(dev_ctx, *out_g, *index, in_g); + return; + } + + if (pooltype == "LAST" || pooltype == "FIRST") { + // set X@Grad be zero at first when pooltype is LAST/FIRST + math::SetConstant functor; + functor(dev_ctx, in_g, 0); + } + auto& place = + *context.template device_context().eigen_device(); + + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + auto in_g_t = + in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + auto out_g_t = out_g->Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); + auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); + auto out_g_e_v = EigenVector::Flatten(out_g_t); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "AVERAGE") { + in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SUM") { + in_g_e.device(place) = (out_g_e).broadcast(bcast); + } else if (pooltype == "SQRT") { + in_g_e.device(place) = + (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); + } else if (pooltype == "LAST") { + in_g_e.chip(h - 1, 0).device(place) = out_g_e_v; + } else if (pooltype == "FIRST") { + in_g_e.chip(0, 0).device(place) = out_g_e_v; + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4e42d3eeb5555be693946ccde30ef87f88d0f32 --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
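For illustration, the pooling arithmetic described by the SequencePoolOp comment and kernels above can be sketched in a few lines of standalone C++. This is a minimal sketch only, not the operator itself; the function and variable names are hypothetical and nothing here depends on the Paddle framework.

// Illustrative sketch only: pool a 1-D sequence batch the way the
// SequencePoolOp documentation describes, given explicit LoD offsets.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

std::vector<double> SequencePool(const std::vector<double>& x,
                                 const std::vector<size_t>& lod,
                                 const std::string& pooltype) {
  std::vector<double> out;
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    size_t begin = lod[i], end = lod[i + 1];
    double len = static_cast<double>(end - begin);
    double sum = 0.0, max_v = x[begin];
    for (size_t j = begin; j < end; ++j) {
      sum += x[j];
      max_v = std::max(max_v, x[j]);
    }
    if (pooltype == "AVERAGE") out.push_back(sum / len);
    else if (pooltype == "SUM") out.push_back(sum);
    else if (pooltype == "SQRT") out.push_back(sum / std::sqrt(len));
    else if (pooltype == "MAX") out.push_back(max_v);
    else if (pooltype == "LAST") out.push_back(x[end - 1]);
    else /* FIRST */ out.push_back(x[begin]);
  }
  return out;
}

int main() {
  // Same toy batch as the operator comment: three sequences of length 2, 3, 2.
  std::vector<double> x = {1, 3, 2, 4, 6, 5, 1};
  std::vector<size_t> lod = {0, 2, 5, 7};
  for (double v : SequencePool(x, lod, "AVERAGE")) std::printf("%g ", v);
  std::printf("\n");  // prints: 2 4 3
  return 0;
}

Run on the toy batch from the operator comment, AVERAGE prints 2 4 3, matching the values listed there.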
+ +#include "paddle/fluid/operators/sequence_reshape_op.h" +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class SequenceReshapeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceReshapeOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_numel = product(x_dims); + PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2."); + int new_dim = ctx->Attrs().Get("new_dim"); + if (ctx->IsRuntime()) { + ctx->SetOutputDim("Out", + {x_numel / new_dim, static_cast(new_dim)}); + } else { + // when compiling, the batch size is undetermined, just set to -1 + ctx->SetOutputDim("Out", {-1, static_cast(new_dim)}); + } + } +}; + +class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with shape " + "being [N, M]."); + AddOutput("Out", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor with " + "shape [T, new_dim] where T is calculated based on X.lod, M and " + "new_dim."); + AddAttr("new_dim", "Sequence dimension of the output LoDTensor."); + AddComment(R"DOC( +Sequence Reshape Operator. + +This operator will rearrange the input sequences. The new dimension is set by +attribute and length of each sequence may change longer or shorter which is +decided by original length, original dimension and new dimension. The following +example will help to illustrate the function of this operator: + +x is a LoDTensor: + x.lod = [[0, 2, 6]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.dims = [6, 2] + +set new_dim = 4 + +then out is a LoDTensor: + out.lod = [[0, 1, 3]] + out.data = [[1, 2, 3, 4], + [5, 6, 7, 8], [9, 10, 11, 12]] + out.dims = [3, 4] + +Currently, only 1-level LoDTensor is supported and please make sure (original +length * original dimension) can be divided by new_dim with no remainder for +each sequence. 
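As a minimal illustration of the LoD bookkeeping just described (a sketch only, with hypothetical names and no dependency on the framework code): each output offset advances by (sequence length * original dimension) / new_dim, and that division must be exact.

// Illustrative sketch only: recompute the LoD for a sequence reshape as
// (seq_len * in_width) / new_dim per sequence.
#include <cassert>
#include <cstdio>
#include <vector>

std::vector<size_t> ReshapeLoD(const std::vector<size_t>& in_lod,
                               size_t in_width, size_t new_dim) {
  std::vector<size_t> out_lod(in_lod.size(), 0);
  for (size_t i = 0; i + 1 < in_lod.size(); ++i) {
    size_t seq_len = in_lod[i + 1] - in_lod[i];
    // Each sequence's total element count must be divisible by new_dim.
    assert((seq_len * in_width) % new_dim == 0);
    out_lod[i + 1] = out_lod[i] + (seq_len * in_width) / new_dim;
  }
  return out_lod;
}

int main() {
  // The documentation example: x.lod = [[0, 2, 6]], x.dims = [6, 2], new_dim = 4.
  std::vector<size_t> out = ReshapeLoD({0, 2, 6}, /*in_width=*/2, /*new_dim=*/4);
  for (size_t v : out) std::printf("%zu ", v);  // prints: 0 1 3
  std::printf("\n");
  return 0;
}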
+ +)DOC"); + } +}; + +class SequenceReshapeGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceReshapeGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceReshapeGradOp should not be null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } +}; + +class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_reshape_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp, + ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker); +REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CPU_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_reshape_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ca3497396eaa6c811c69af4acf4fa3092cff42a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_reshape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_reshape_op.h b/paddle/fluid/operators/sequence_reshape_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a5d1261da917c6e596ff7b85afbfd95ff90f12a --- /dev/null +++ b/paddle/fluid/operators/sequence_reshape_op.h @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +template +class SequenceReshapeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int out_width = context.Attr("new_dim"); + + auto in_dims = in->dims(); + int64_t in_width = in_dims[1]; + auto& in_lod = in->lod(); + + PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, + "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + (uint64_t)in_dims[0], in_lod[0].back(), + "Inconsistent size between X.shape[0] and X.lod()[0].back()."); + + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + + if (in_width == out_width) { + out->set_lod(in->lod()); + } else { + auto& out_lod = *out->mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width, + "Please make sure (sequence_length * dimension) can " + "be divided by new_dim with no remainder for each " + "sequence. The %dth sequence is invalid.", + i + 1); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + + framework::Copy(*in, context.GetPlace(), out); + out->Resize({static_cast(out->lod()[0].back()), out_width}); + } +}; + +template +class SequenceReshapeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x_tensor_ptr = context.Input("X"); + auto* outg_tensor_ptr = + context.Input(framework::GradVarName("Out")); + auto* xg_tensor_ptr = + context.Output(framework::GradVarName("X")); + + xg_tensor_ptr->mutable_data(context.GetPlace()); + framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr); + xg_tensor_ptr->Resize(x_tensor_ptr->dims()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..87b8eff64621290ebd75d2cb76d7c684655b884f --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace paddle { +namespace operators { + +class SequenceSliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Offset"), + "Input(Offset) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceSliceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSliceOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + + auto offset_dim = ctx->GetInputDim("Offset"); + auto length_dim = ctx->GetInputDim("Length"); + + PADDLE_ENFORCE_EQ( + offset_dim.size(), 2UL, + "Only support one level sequence now, The rank of offset must be 2."); + PADDLE_ENFORCE_EQ( + length_dim.size(), 2UL, + "Only support one level sequence now, The rank of Length must be 2."); + + // Initialize the output's dims to maximum, + // and re-set to real dims by the value of Offset and Length at kernel + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), " + "the input of SequenceSliceOp."); + AddInput("Offset", + "(Tensor), " + "a vector to describe the offset of every input sequence for " + "sub sequence item."); + AddInput("Length", + "(Tensor), " + "a vector to describe the length of every input sequence for " + "sub sequence item."); + AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); + AddComment(R"DOC( +Sequence slice operator + +The operator crops a subsequence from given sequence with given start offset and subsequence length. +It only supports sequence (LoD Tensor with level number is 1). +- Case: + X = [[a1, a2; + b1, b2; + c1, c2] + [d1, d2; + e1, e2]] + LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2) + Offset = [[0], [1]]; Length = [[2], [1]] + + Out = [[a1, a2; + b1, b2] + [e1, e2]] + LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2) +NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0. 
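As a minimal illustration of the slicing rule in the case above (a standalone sketch with hypothetical names, not the operator's kernel): sequence i contributes the half-open row range [lod[i] + Offset[i], lod[i] + Offset[i] + Length[i]), and the output LoD is rebuilt from the lengths.

// Illustrative sketch only: slice each sequence of a LoD batch by a
// per-sequence offset and length (row width 1 here for brevity).
#include <cstdio>
#include <vector>

struct Sliced {
  std::vector<int> data;    // flattened output rows
  std::vector<size_t> lod;  // output LoD, built from the lengths
};

Sliced SequenceSlice(const std::vector<int>& x, const std::vector<size_t>& lod,
                     const std::vector<size_t>& offset,
                     const std::vector<size_t>& length) {
  Sliced out;
  out.lod.push_back(0);
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    size_t begin = lod[i] + offset[i];
    for (size_t j = 0; j < length[i]; ++j) out.data.push_back(x[begin + j]);
    out.lod.push_back(out.lod.back() + length[i]);
  }
  return out;
}

int main() {
  // LoD(X) = {0, 3, 5}, Offset = {0, 1}, Length = {2, 1} -> LoD(Out) = {0, 2, 3}
  Sliced out = SequenceSlice({10, 11, 12, 13, 14}, {0, 3, 5}, {0, 1}, {2, 1});
  for (size_t v : out.lod) std::printf("%zu ", v);  // prints: 0 2 3
  std::printf("\n");
  for (int v : out.data) std::printf("%d ", v);     // prints: 10 11 14
  std::printf("\n");
  return 0;
}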
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, + sequence_slice_grad, ops::SequenceSliceGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..041fabdf9a2dc73540ab45e5e86aa1ef71bed4dc --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_slice_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_slice, + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_slice_grad, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h new file mode 100644 index 0000000000000000000000000000000000000000..65c36a32aa12c628db5c4f0c104d3977e625ad97 --- /dev/null +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, + const int64_t* length_data) { + auto out_lod = in.lod(); + size_t lod_offset = 0; + + auto n = in.lod()[0].size() - 1; + out_lod[0][0] = 0; + for (size_t i = 0; i < n; ++i) { + lod_offset += length_data[i]; + out_lod[0][i + 1] = lod_offset; + } + return out_lod; +} + +template +class SequenceSliceOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + auto n = lod[0].size() - 1; + + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ( + n, static_cast(length->dims()[0]), + "The size of input-sequence and length-array should be the same"); + PADDLE_ENFORCE_EQ( + n, static_cast(offset->dims()[0]), + "The size of input-sequence and offset-array should be the same"); + + const int64_t* offset_data = offset->data(); + const int64_t* length_data = length->data(); + framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if (platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_LT(0, offset_data[i], + "The offset[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(0, length_data[i], + "The length[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow."); + } + + out->mutable_data(ctx.GetPlace()); + auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); + auto out_dims = in->dims(); + out_dims[0] = out_lod[0][out_lod[0].size() - 1]; + out->Resize(out_dims); + out->set_lod(out_lod); + + auto in_stride = framework::stride(in->dims()); + auto out_stride = framework::stride(out->dims()); + + size_t out_offset = 0; + for (size_t i = 0; i < n; ++i) { + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); + out_offset += length_data[i] * in_stride[0]; + } + } +}; + +template +class SequenceSliceGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + + const int64_t* offset_data = offset->data(); + const int64_t* length_data = length->data(); + framework::Tensor offset_cpu; + framework::Tensor length_cpu; + + if 
(platform::is_gpu_place(ctx.GetPlace())) { + offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); + offset_data = offset_cpu.data(); + + length_cpu.mutable_data(length->dims(), platform::CPUPlace()); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); + length_data = length_cpu.data(); + } + + auto lod = in->lod(); + auto out_lod = out_grad->lod(); + + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + x_grad->set_lod(in->lod()); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); + + auto out_grad_stride = framework::stride(out_grad->dims()); + + for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { + Tensor out_grad_t = + out_grad->Slice(static_cast(out_lod[0][i]), + static_cast(out_lod[0][i + 1])); + auto out_grad_stride = framework::stride(out_grad_t.dims()); + + auto x_grad_stride = framework::stride(x_grad->dims()); + + Tensor x_grad_t = x_grad->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), out_grad_t.data(), + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f966b7162077943dd78d601743b3a3e2e103444b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace paddle { +namespace operators { + +class SequenceSoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceSoftmaxOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension " + "of length 1."); + AddOutput("Out", + "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension " + "of length 1."); + AddComment(R"DOC( +Sequence Softmax Operator. + +SequenceSoftmaxOp computes the softmax activation among all time-steps for each +sequence. The dimension of each time-step should be 1. 
Thus, the shape of +input Tensor can be either [N, 1] or [N], where N is the sum of the length +of all sequences. + +The algorithm works as follows: + + for i-th sequence in a mini-batch: + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ + +For example, for a mini-batch of 3 sequences with variable-length, +each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], +then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :] +and N turns out to be 7. + +)DOC"); + } +}; + +class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), + "Input(Out) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceSoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) of SequenceSoftmaxOp should not be null."); + + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of " + "the same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, + ops::SequenceSoftmaxOpMaker, sequence_softmax_grad, + ops::SequenceSoftmaxGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c42dfd7540954616eb7bf012160a98211c3caf1b --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c21c67b3362835b2ff87045a213b2636556346 --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceSoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = x->lod(); + auto dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), + "The first dimension of Input(X) should be equal to the " + "sum of all sequences' lengths."); + PADDLE_ENFORCE_EQ(dims[0], x->numel(), + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1."); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); + } + } +}; + +template +class SequenceSoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1e23a62f4ec52b40cfa1febc98fbfb045f45efd --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sgd_op.h" + +namespace paddle { +namespace operators { + +class SGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of SGDOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of SGDOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 element"); + auto param_dim = ctx->GetInputDim("Param"); + // TODO(qijun): check dimensions of Param and Grad at complie + // and run time. + ctx->SetOutputDim("ParamOut", param_dim); + } +}; + +class SGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("LearningRate", "(Tensor) Learning rate of SGD"); + AddInput("Grad", "(Tensor) Input gradient"); + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddComment(R"DOC( + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. + +$$param\_out = param - learning\_rate * grad$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..09374e20494be2eebba913bd90a7c32e1aa0015b --- /dev/null +++ b/paddle/fluid/operators/sgd_op.cu @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sgd_op.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, + const int num, T* p_out) { + T lr = learning_rate[0]; + int grid_size = blockDim.x * gridDim.x; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) { + T g_data = g[i]; + T p_data = p[i]; + p_out[i] = p_data - lr * g_data; + } +} + +template +__global__ void SparseSGDFunctorKernel(const T* selected_rows, + const int64_t* rows, + const T* learning_rate, T* tensor_out, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + } +} +} // namespace + +template +class SGDOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + auto* grad_data = grad->data(); + auto* param_data = param->data(); + auto* param_out_data = param_out->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + + SGDKernel<<>>( + grad_data, param_data, learning_rate->data(), param->numel(), + param_out_data); + + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + framework::Vector in_rows(grad->rows()); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in_rows.size()); + SparseSGDFunctorKernel< + T, 256><<>>( + in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), + out_data, in_row_numel); + + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel, + ops::SGDOpCUDAKernel); diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f1eaaecdb1eef1b42ea5d3b7315133c665b50df6 --- /dev/null +++ b/paddle/fluid/operators/sgd_op.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace operators { + +template +class SGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); + + auto* grad_var = ctx.InputVar("Grad"); + // Actually, all tensors are LoDTensor except SelectedRows. + if (grad_var->IsType()) { + param_out->mutable_data(ctx.GetPlace()); + auto* grad = ctx.Input("Grad"); + + auto p = framework::EigenVector::Flatten(*param); + auto g = framework::EigenVector::Flatten(*grad); + auto o = framework::EigenVector::Flatten(*param_out); + auto* lr = learning_rate->data(); + + o = p - lr[0] * g; + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out); + auto* grad = ctx.Input("Grad"); + + auto in_height = grad->height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + + auto& in_value = grad->value(); + auto& in_rows = grad->rows(); + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + auto* lr = learning_rate->data(); + + for (size_t i = 0; i < in_rows.size(); i++) { + for (int64_t j = 0; j < in_row_numel; j++) { + out_data[in_rows[i] * in_row_numel + j] -= + lr[0] * in_data[i * in_row_numel + j]; + } + } + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..df50a324fde1637f1f9f64a0b0d4eff8ba3f26d2 --- /dev/null +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class ShrinkRNNMemoryOp : public ArrayOp { + public: + ShrinkRNNMemoryOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); + auto &x_tensor = x_var->Get(); + size_t offset = this->GetOffset(scope, place); + auto *rank_table_var = scope.FindVar(Input("RankTable")); + PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); + auto &rank_table = rank_table_var->Get(); + + auto &rank_items = rank_table.items(); + int dst_num_rows = + std::lower_bound(rank_items.begin(), rank_items.end(), offset, + [](const framework::LoDRankTable::TableItem &a, + size_t b) { return a.length > b; }) - + rank_items.begin(); + + auto *out_var = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set."); + auto &out_tensor = *out_var->GetMutable(); + + size_t height = dst_num_rows; + + // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && + x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } + + if (dst_num_rows != 0) { + out_tensor.ShareDataWith(x_tensor.Slice(0, height)); + } + } +}; + +class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); + AddInput("I", + "(LoDTensor) The step index. The RNN step memory 'X' will be " + "shrinked to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + AddComment(R"DOC( +This operator is used to shrink output batch of memory defined in dynamic RNN. + +Dynamic RNN is able to handle variable-length sequences, in which, sequences in +a mini-batch are sorted by their lengths first. After that, the longest sequence +becomes the first one in the sorted batch, followed by the second longest, the +third longest, and so on. Dynamic RNN then slices a batch input timestep by +timestep from the sorted input. Once any sequence in the input batch reaches its +end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input +batch size for the next time step. 
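As a minimal illustration of the shrink step described above (a sketch only; names are hypothetical): with sequences sorted longest-first, the number of sequences still active at time step t is the count of lengths greater than t, which a single std::lower_bound with a greater-than comparator yields, in the same way ShrinkRNNMemoryOp::Run searches the rank table.

// Illustrative sketch only: count sequences still alive at step t when the
// lengths are sorted in descending order.
#include <algorithm>
#include <cstdio>
#include <vector>

int AliveAtStep(const std::vector<size_t>& sorted_lengths, size_t t) {
  // sorted_lengths is descending; count entries whose length > t.
  auto it = std::lower_bound(
      sorted_lengths.begin(), sorted_lengths.end(), t,
      [](size_t len, size_t step) { return len > step; });
  return static_cast<int>(it - sorted_lengths.begin());
}

int main() {
  std::vector<size_t> lengths = {5, 3, 3, 2};  // already sorted, longest first
  for (size_t t = 0; t < 6; ++t)
    std::printf("step %zu: %d sequences alive\n", t, AliveAtStep(lengths, t));
  // prints 4, 4, 3, 1, 1, 0 for steps 0..5
  return 0;
}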
+)DOC"); + } +}; + +class ShrinkRNNMemoryInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasInput("I")); + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ShrinkRNNMemoryGradOp : public ArrayOp { + public: + ShrinkRNNMemoryGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); + auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); + PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); + auto *x_var = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x_var != nullptr); + + auto &x_tensor = x_var->Get(); + auto &dx_tensor = *dx_var->GetMutable(); + dx_tensor.Resize(x_tensor.dims()); + dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + if (dout_var == nullptr) { // dx_tensor fill zero + math::set_constant(dev_ctx, &dx_tensor, 0.0f); + } else { + auto &dout_tensor = dout_var->Get(); + auto height = dout_tensor.dims()[0]; + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + if (dx_tensor.dims()[0] > height) { + auto rest_tensor = dx_tensor.Slice( + static_cast(height), static_cast(dx_tensor.dims()[0])); + math::set_constant(dev_ctx, &rest_tensor, 0.0f); + } + } + dx_tensor.set_lod(x_tensor.lod()); + } +}; + +class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X")); + PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("shrink_rnn_memory_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp, + ops::ShrinkRNNMemoryInferShape, + ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker); +REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp, + ops::ShrinkRNNMemoryGradInferShape); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3188415a2bd4434704ca95b92427094023527019 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ 
-0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SigmoidCrossEntropyWithLogitsGradOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2, + "Input(Label)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dout_dims.size(), 2, + "Input(Out@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], + "The 1st dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], + "The 2nd dimension of Input(X) and Input(Label) should " + "be equal."); + PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], + "The 1st dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1], + "The 2nd dimension of Input(X) and Input(Out@Grad) " + "should be equal."); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } +}; + +class SigmoidCrossEntropyWithLogitsOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto, + OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, 
op_checker) { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape N x D, " + "where N is the batch size and D is the number of classes. " + "This input is a tensor of logits computed by the previous " + " operator. Logits are unscaled log probabilities given as " + "log(p/(1-p))."); + AddInput("Label", + "(Tensor, default Tensor), a 2-D tensor of the same type " + "and shape as X. This input is a tensor of probabalistic labels " + "for each logit"); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape N x D " + " of elementwise logistic losses."); + AddComment(R"DOC( +SigmoidCrossEntropyWithLogits Operator. + +This measures the element-wise probability error in classification tasks +in which each class is independent. This can be thought of as predicting labels +for a data-point, where labels are not mutually exclusive. +For example, a news article can be about politics, technology or sports +at the same time or none of these. + +The logistic loss is given as follows: + + $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ + +We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: + + $$loss = X - X * Labels + \log(1 + \exp(-X))$$ + +For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, +we reformulate the loss as follows: + + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + +Both the input `X` and `Labels` can carry the LoD (Level of Details) information. +However the output only shares the LoD with input `X`. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsOp, + ops::SigmoidCrossEntropyWithLogitsOpMaker, + sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradOp); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..daa9d3e4fa5aeba77f770f69d6057ced98741eaa --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h new file mode 100644 index 0000000000000000000000000000000000000000..977849f7627bc3f5b08a5f34bd300ab1442c6276 --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto out = framework::EigenVector::Flatten(*Out); + auto &place = *context.device_context().eigen_device(); + + // term1 = max(x, 0) + auto term1 = x.cwiseMax(static_cast(0)); + // term2 = x * labels + auto term2 = x * labels; + // term3 = log(1 + exp(-abs(x))) + auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); + + out.device(place) = term1 - term2 + term3; + } +}; + +// dX = sigmoid(X) - labels +template +class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *Labels = context.Input("Label"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto labels = framework::EigenVector::Flatten(*Labels); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto &place = + *context.template device_context().eigen_device(); + + auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); + dx.device(place) = dout * (sigmoid_x - labels); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..54b962538b8426141c9ab1b9269c0ed8bd5a8496 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sign_op.h" + +namespace paddle { +namespace operators { + +class SignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SignOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class SignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of sign operator."); + AddOutput("Out", "(Tensor) Output tensor of sign operator."); + AddComment(R"DOC( +Sign operator + +$$Out = X.sign()$$ +)DOC"); + } +}; + +class SignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", 0.0f); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, + ops::SignGradMaker); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..93cdb311eb4961a7754f9adfe14a15f3b2d0ca58 --- /dev/null +++ b/paddle/fluid/operators/sign_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
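SignGradMaker above lowers the backward pass to a scale op with scale = 0.0f, i.e. the gradient of sign(x) is taken to be zero everywhere. A standalone sketch (not part of the patch) of the elementwise forward function and that gradient convention:

#include <cstdio>

// sign(x): -1 for negative, 0 at zero, +1 for positive
static float sign(float x) { return (x > 0.f) - (x < 0.f); }

int main() {
  const float xs[] = {-2.5f, 0.f, 3.1f};
  for (float x : xs) {
    // forward value; backward pass is the upstream gradient scaled by 0
    std::printf("sign(%.1f) = %.0f, d sign/dx treated as 0\n", x, sign(x));
  }
  return 0;
}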
*/ + +#include "paddle/fluid/operators/sign_op.h" + +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c2ebebee40d9b64dd8b658b904e631ba294e41e --- /dev/null +++ b/paddle/fluid/operators/sign_op.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class SignKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& place = + *context.template device_context().eigen_device(); + eigen_out.device(place) = eigen_in.sign(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..be4c7a56a84e84c39a578b958fe7c9ad551f54f6 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace paddle { +namespace operators { + +class SmoothL1LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims, y_dims); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The tensor rank of Input(X) should not be less than 2."); + if (ctx->HasInput("InsideWeight")) { + PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), + "If weights are provided, must specify both " + "inside and outside weights."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); + } + + ctx->SetOutputDim("Diff", x_dims); + // loss is a two-rank tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + } +}; + +template +class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); + AddInput("Y", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); + AddInput("InsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " + "by this tensor element by element.") + .AsDispensable(); + AddInput("OutsideWeight", + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") + .AsDispensable(); + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") + .AsIntermediate(); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); + AddAttr("sigma", + "Hyper parameter of smooth l1 loss op." + "A float scalar with default value 3.0.") + .SetDefault(3.0); + AddComment(R"DOC( +Smooth L1 Loss Operator. + +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. +For each instance, it computes the smooth l1 loss element by element first +and then sums all the losses. So the shape of Out is [batch_size, 1]. + +The equation is: +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. 
+ +)DOC"); + } +}; + +class SmoothL1LossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto in_dims = ctx->GetInputDim("X"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(out_dims.size(), 2, + "The tensor rank of Input(Out@Grad) should be 2."); + PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0], + "The 1st dimension of Input(Out@Grad) must be " + "same as input."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "The 2nd dimension of Input(Out@Grad) must be 1."); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, in_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, in_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, + ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, + ops::SmoothL1LossGradOp); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CPU_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..94c0d6cd299075541f0ef66cbc0bd48a8f4d51b3 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..325ad824e1874281873b5e41ab62db0fa43040d0 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SmoothL1LossForward { + HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return 0.5 * val * val * sigma2; + } else { + return abs_val - 0.5 / sigma2; + } + } + + T sigma2; +}; + +template +class SmoothL1LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* in2 = context.Input("InsideWeight"); + auto* in3 = context.Input("OutsideWeight"); + auto* out0 = context.Output("Diff"); + auto* out1 = context.Output("Out"); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto* place = + context.template device_context().eigen_device(); + + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in2 != nullptr) && (in3 != nullptr); + + auto x = EigenVector::Flatten(*in0); + auto y = EigenVector::Flatten(*in1); + auto diff = EigenVector::Flatten(*out0); + + diff.device(*place) = x - y; + // multiply inside weight + if (has_weight) { + auto inside_weight = EigenVector::Flatten(*in2); + // cache diff, reused in bp + diff.device(*place) = diff * inside_weight; + } + + auto in_counts = in0->numel(); + Tensor ptensor_errors; + ptensor_errors.mutable_data({static_cast(in_counts)}, + context.GetPlace()); + auto errors = EigenVector::Flatten(ptensor_errors); + // apply smooth l1 forward + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + + // multiply outside weight + if (has_weight) { + auto outside_weight = EigenVector::Flatten(*in3); + errors.device(*place) = errors * outside_weight; + } + auto loss = EigenVector::Flatten(*out1); + // first dimension of 'X' is the number of samples + auto mat_dims = + framework::make_ddim({static_cast(in0->dims()[0]), + static_cast(in_counts / in0->dims()[0])}); + auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); + } +}; + +template +struct SmoothL1LossBackward { + HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val < 1.0 / sigma2) { + return sigma2 * val; + } else { + return (0 < val) - (val < 0); + } + } + + T sigma2; +}; + +template +class SmoothL1LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("InsideWeight"); + auto* in1 = context.Input("OutsideWeight"); + auto* in2 = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); + auto sigma = static_cast(context.Attr("sigma")); + T sigma2 = sigma * sigma; + bool has_weight = (in0 != nullptr) && (in1 != nullptr); + + auto* place = + context.template device_context().eigen_device(); + + auto in_dims = in2->dims(); + auto counts = in2->numel(); + auto cols = counts / in_dims[0]; + auto mat_dims = framework::make_ddim( + {static_cast(in_dims[0]), 
static_cast(cols)}); + + Tensor ptensor_diff; + ptensor_diff.mutable_data({static_cast(counts)}, + context.GetPlace()); + auto diff = EigenVector::Flatten(ptensor_diff); + // apply smooth l1 backwoard + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( + SmoothL1LossBackward(sigma2)); + + // compute weights + Tensor ptensor_weights; + ptensor_weights.mutable_data(mat_dims, context.GetPlace()); + auto weights = EigenMatrix::From(ptensor_weights); + // initialize to 1.0 + weights.device(*place) = weights.constant(static_cast(1.0)); + if (has_weight) { + auto inside_weight = EigenMatrix::From(*in0, mat_dims); + auto outside_weight = EigenMatrix::From(*in1, mat_dims); + weights.device(*place) = inside_weight * outside_weight; + } + + // compute gradients + auto out_grad = EigenMatrix::From(*og); + auto diff_mat_view = EigenMatrix::From(ptensor_diff, mat_dims); + auto gradients = out_grad.broadcast( + Eigen::array({{1, static_cast(cols)}})) * + weights * diff_mat_view; + + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); + + if (out0) { + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenMatrix::From(*out0, mat_dims); + x_grad.device(*place) = gradients; + } + + if (out1) { + out1->mutable_data(context.GetPlace()); + auto y_grad = EigenMatrix::From(*out1, mat_dims); + y_grad.device(*place) = -1 * gradients; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d9462d08b9cc06df2d0dca568dbbe1c50dc948f --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SoftmaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SoftmaxOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2UL, + "The input of softmax op must be a matrix."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "The input tensor of softmax. " + "2-D with shape [batch_size, input_feature_dimensions]."); + AddOutput("Out", "The normalized values with the same shape as X."); + AddComment(R"DOC( +Softmax Operator. 
+ +The input of the softmax operator is a 2-D tensor with shape N x K (N is the +batch_size, K is the dimension of input feature). The output tensor has the +same shape as the input tensor. + +For each row of the input tensor, the softmax operator squashes the +K-dimensional vector of arbitrary real values to a K-dimensional vector of real +values in the range [0, 1] that add up to 1. +It computes the exponential of each element and the sum of the exponentials of +all elements in that row; the output is the ratio of each element's exponential +to this sum. + +For each row $i$ and each column $j$ in Input(X), we have: + $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])}$$ + +)DOC"); + } +}; + +class SoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"), + ctx->GetInputDim(framework::GradVarName("Out")), + "Input(Out) and its gradients should have the same shape."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, + ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c53d8a2bc82dcfafc178b67299769b2e06109eb3 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9287f0231031675b09c941f19c1df1fefc993506 --- /dev/null +++ b/paddle/fluid/operators/softmax_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Output("Out"); + + // allocate memory on device. + Out->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.template device_context(), X, Out); + } +}; + +template +class SoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + + // allocate memory on device. + dX->mutable_data(context.GetPlace()); + + math::SoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..79d56cb97d38ebd725668442c29229ef22f5b05e --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(Tensor, default: Tensor), The unscaled log probabilities " + "which is a 2-D tensor with shape [N x K]. N is the batch_size, " + "and K is the class number."); + AddInput("Label", + "(Tensor) The ground truth which is a 2-D tensor. If soft_label " + "is set to false, Label is a Tensor with shape [N x 1]. If " + "soft_label is set to true, Label is a Tensor with " + "shape [N x K]."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A 2-D tensor with shape [N x K]. " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation.") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), A 2-D tensor. 
The cross " + "entropy loss with shape [N x 1]."); + AddAttr( + "soft_label", + "(bool, default: false), A flag to indicate whether to interpretate " + "the given labels as soft labels.") + .SetDefault(false); + AddComment(R"DOC( +Softmax With Cross Entropy Operator. + +Cross entropy loss with softmax is used as the output layer extensively. This +operator computes the softmax normalized values for each row of the input +tensor, after which cross-entropy loss is computed. This provides a more +numerically stable gradient. + +Because this operator performs a softmax on logits internally, it expects +unscaled logits. This operator should not be used with the output of +softmax operator since that would produce incorrect results. + +When the attribute soft_label is set false, this operators expects mutually +exclusive hard labels, each sample in a batch is in exactly one class with a +probability of 1.0. Each sample in the batch will have a single label. + +The equation is as follows: + +1) Hard label (one-hot label, so every sample has exactly one class) + +$$Loss_j = -\text{Logit}_{Label_j} + +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), +j = 1,..., K$$ + +2) Soft label (each sample can have a distribution over all classes) + +$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), +j = 1,...,K$$ + +)DOC"); + } +}; + +class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Softmax"), + "Output(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ( + logits_dims.size(), 2UL, + "The input of softmax_with_cross_entropy should be a 2-D tensor."); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1], + "If Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "If Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim("Softmax", logits_dims); + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Softmax"), + "Input(Softmax) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + 
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@Grad) should be not null."); + + auto softmax_dims = ctx->GetInputDim("Softmax"); + auto labels_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL, + "The labels should be a 2-D tensor."); + + if (ctx->Attrs().Get("soft_label")) { + PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1], + "When Attr(soft_label) == true, the 2nd dimension of " + "Input(X) and Input(Label) should be equal."); + } else { + PADDLE_ENFORCE_EQ(labels_dims[1], 1UL, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(Label) should be 1."); + } + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Loss"))->type()), + ctx.device_context()); + } +}; + +class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("softmax_with_cross_entropy_grad"); + grad_op->SetInput("Label", Input("Label")); + grad_op->SetInput("Softmax", Output("Softmax")); + grad_op->SetInput("Loss", Output("Loss")); + grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); + grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, + ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker); +REGISTER_OPERATOR(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyOpGrad); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyKernel, + ops::SoftmaxWithCrossEntropyKernel); +REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradKernel, + ops::SoftmaxWithCrossEntropyGradKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..410d9e8887c593249495e08424467b4be15c9bcb --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
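A standalone sketch (not part of the patch) of the hard-label loss documented above, Loss_j = -Logit_{Label_j} + log(sum_i exp(Logit_i)), evaluated with the usual max-shifted log-sum-exp; the kernels further below delegate this work to math::SoftmaxFunctor and math::CrossEntropyFunctor:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Cross entropy of a softmax over `logits` against a hard label index.
static double softmax_xent_hard(const std::vector<double>& logits, int label) {
  double m = *std::max_element(logits.begin(), logits.end());
  double sum = 0.0;
  for (double v : logits) sum += std::exp(v - m);  // shift by max for stability
  double log_sum_exp = m + std::log(sum);
  return -logits[label] + log_sum_exp;
}

int main() {
  std::vector<double> logits = {2.0, 0.5, -1.0};
  for (int label = 0; label < 3; ++label) {
    std::printf("label=%d loss=%.6f\n", label, softmax_xent_hard(logits, label));
  }
  return 0;
}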
*/ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +namespace { +template +__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad, + const int64_t* labels, const int batch_size, + const int class_num) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int sample_idx = tid / class_num; + + if (tid < batch_size) { + PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num); + logit_grad[tid * class_num + labels[tid]] -= static_cast(1.); + } + + __syncthreads(); + + if (tid < batch_size * class_num) { + logit_grad[tid] *= loss_grad[sample_idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* logit_grad, + const T* loss_grad, + const T* labels, + const int batch_size, + const int class_num) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < batch_size * class_num) { + int row_ids = ids / class_num; + logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]); + } +} +} // namespace + +template +class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + + Tensor* loss = context.Output("Loss"); + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, + context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + const Tensor* labels = context.Input("Label"); + const T* loss_grad_data = + context.Input(framework::GradVarName("Loss"))->data(); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + T* logit_grad_data = logit_grad->data(); + + const int batch_size = logit_grad->dims()[0]; + const int class_num = logit_grad->dims()[1]; + int block = 512; + int grid = (batch_size * class_num + block - 1) / block; + + if (context.Attr("soft_label")) { + const T* label_data = labels->data(); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } else { + const int64_t* label_data = labels->data(); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..0927efd42ceb35cc4183f84d160c44f35f6cc3f5 --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), + "This kernel only runs on CPU."); + const Tensor* logits = context.Input("Logits"); + const Tensor* labels = context.Input("Label"); + Tensor* softmax = context.Output("Softmax"); + Tensor* loss = context.Output("Loss"); + + softmax->mutable_data(context.GetPlace()); + loss->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + } +}; + +template +class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* out_grad = + context.Input(framework::GradVarName("Loss")); + const Tensor* labels = context.Input("Label"); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + logit_grad->ShareDataWith(*context.Input("Softmax")); + + const int class_num = logit_grad->dims()[1]; + auto out_grad_mat = EigenMatrix::From(*out_grad); + auto logit_grad_mat = EigenMatrix::From(*logit_grad); + auto& place = *context.template device_context() + .eigen_device(); + if (context.Attr("soft_label")) { + auto lbl_mat = EigenMatrix::From(*labels); + logit_grad_mat.device(place) = + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * + (logit_grad_mat - lbl_mat); + } else { + logit_grad_mat.device(place) = + logit_grad_mat * + out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); + + const int batch_size = logit_grad->dims()[0]; + const int64_t* label_data = labels->data(); + T* logit_grad_data = logit_grad->data(); + const T* out_grad_data = out_grad->data(); + for (int i = 0; i < batch_size; ++i) { + logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f821dc54d7bbe697d3642e64dc1628ec7d966592 --- /dev/null +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle 
Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct CopyRange { + size_t begin; + size_t end; +}; + +using LoD = framework::LoD; + +class SplitLoDTensorOp : public framework::OperatorBase { + public: + SplitLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &mask = scope.FindVar(Input("Mask"))->Get(); + auto *out_true = + scope.FindVar(Output("OutTrue"))->GetMutable(); + auto *out_false = + scope.FindVar(Output("OutFalse"))->GetMutable(); + auto level = static_cast(Attr("level")); + auto &x_lod = x.lod(); + auto &mask_dim = mask.dims(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + + std::unique_ptr cpu_mask{new framework::LoDTensor()}; + if (platform::is_cpu_place(mask.place())) { + cpu_mask->ShareDataWith(mask); + } else if (platform::is_gpu_place(mask.place())) { +#ifdef PADDLE_WITH_CUDA + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } + auto *mask_data = cpu_mask->data(); + + std::vector> copy_ranges(mask_dim[0]); + + // set out_true/out_false lod + for (size_t t = 0; t < 2; t++) { + LoD *lod = nullptr; + if (t == 0) { + lod = out_false->mutable_lod(); + } else { + lod = out_true->mutable_lod(); + } + lod->clear(); + for (size_t i = 0; i < static_cast(mask_dim[0]); i++) { + if (static_cast(mask_data[i]) == t) { + size_t start_idx = i; + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x_lod, start_idx, start_idx + 1, level); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); + } + } + } + + for (size_t t = 0; t < 2; ++t) { + framework::LoDTensor *out; + if (t == 0) { + out = out_false; + } else { + out = out_true; + } + auto &ranges = copy_ranges[t]; + size_t height = std::accumulate( + ranges.begin(), ranges.end(), 0UL, + [](size_t a, const CopyRange &b) { return a + b.end - b.begin; }); + auto x_dim = x.dims(); + x_dim[0] = static_cast(height); + out->Resize(x_dim); + out->mutable_data(x.place(), x.type()); + size_t offset = 0; + for (auto &each_range : ranges) { + size_t len = each_range.end - each_range.begin; + if (len == 0) { + continue; + } + // out[offset: offset+len] = x[each_range.begin: each_range.end] + auto slice = out->Slice(static_cast(offset), + 
static_cast(offset + len)); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); + offset += len; + } + } + } +}; + +class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input LoDTensor"); + AddInput("Mask", "A bool column vector which masks the input"); + AddOutput("OutTrue", "True branch of input LoDTensor"); + AddOutput("OutFalse", "False branch of input LoDTensor"); + AddAttr("level", "(int) the specific lod level to split.") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment( + R"DOC( + Split a LoDTensor with a Mask at a certain LoD level. Suppose the input + LoDTensor has 3 sequences at that LoD level and the Mask is a bool column + vector, such as [0, 1, 0] at the same level. The first and third sequences + will be sent to the False output LoDTensor, whereas the second sequence will + be sent to the True output LoDTensor. Please refer to MergeLoDTensorOp.)DOC"); + } +}; + +class SplitLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "SplitLoDTensorOp must have input X."); + PADDLE_ENFORCE(context->HasInput("Mask"), + "SplitLoDTensorOp must have input Mask."); + PADDLE_ENFORCE(context->HasOutput("OutTrue"), + "SplitLoDTensorOp must have output OutTrue."); + PADDLE_ENFORCE(context->HasOutput("OutFalse"), + "SplitLoDTensorOp must have output OutFalse."); + + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ(mask_dim.size(), 2); + PADDLE_ENFORCE_EQ(mask_dim[1], 1); + + context->SetOutputDim("OutTrue", context->GetInputDim("X")); + context->SetOutputDim("OutFalse", context->GetInputDim("X")); + } +}; + +class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("merge_lod_tensor"); + grad_op->SetInput("InTrue", OutputGrad("OutTrue")); + grad_op->SetInput("InFalse", OutputGrad("OutFalse")); + grad_op->SetInput("Mask", Input("Mask")); + grad_op->SetInput("X", Input("X")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp, + ops::SplitLoDTensorOpProtoMaker, + ops::SplitLoDTensorInferShape, + ops::SplitLoDTensorArrayGradMaker); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8bc22fe1d3d24866b5bf2506ed0ff585d259cc2 --- /dev/null +++ b/paddle/fluid/operators/split_op.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_op.h" +#include "paddle/fluid/operators/net_op.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SplitOp should not be null."); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + "Outputs(Out) of SplitOp should not be empty."); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + size_t num = static_cast(ctx->Attrs().Get("num")); + std::vector sections = static_cast>( + ctx->Attrs().Get>("sections")); + const size_t outs_number = outs_names.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int64_t in_axis_dim = in_dims[axis]; + PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, + "tensor split does not result" + " in an equal division"); + size_t out_axis_dim = in_axis_dim / num; + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + PADDLE_ENFORCE_EQ(sections.size(), outs_number, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } + } +}; + +class SplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of the split operator."); + AddOutput("Out", "(Tensor) Output tensors of the split operator.") + .AsDuplicable(); + AddComment(R"DOC( +Split operator + +This operator splits the input tensor into multiple sub-tensors. + +Example: + Input = [[1,2], + [3,4], + [5,6]] + sections = [2,1] + axis = 0 + Output[0] = [[1,2], + [3,4]] + Output[1] = [[5,6]] + + )DOC"); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. 
This must evenly divide " + "Input.dims()[axis]") + .SetDefault(0); + AddAttr("axis", + "(int, default 0) " + "The axis which the input will be splited on.") + .SetDefault(0); + } +}; + +class SplitGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto op = new framework::OpDesc(); + op->SetType("concat"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + op->SetAttrMap(Attrs()); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +USE_CPU_ONLY_OP(concat); + +REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); +REGISTER_OP_CPU_KERNEL(split, + ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..279691c759e988ea26a62e1263198d2a9d878cf9 --- /dev/null +++ b/paddle/fluid/operators/split_op.cu.cc @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e78218f2fb108dac5bce717e03ce0aba1ed88195 --- /dev/null +++ b/paddle/fluid/operators/split_op.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
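SplitOp::InferShape above derives each output's extent along `axis` either from `num` (an equal split that must evenly divide the axis dimension) or from the explicit `sections` list. A standalone sketch (not part of the patch; the helper name split_sizes is ours) of that rule, reproducing the DOC example with sections = {2, 1}:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int64_t> split_sizes(int64_t axis_dim, int num,
                                        const std::vector<int64_t>& sections) {
  std::vector<int64_t> out;
  if (num > 0) {
    // equal split: axis_dim must be divisible by num
    assert(axis_dim % num == 0 && "tensor split does not result in an equal division");
    out.assign(num, axis_dim / num);
  } else {
    out = sections;  // one entry per output, summing to axis_dim
  }
  return out;
}

int main() {
  // axis dimension 3 split by sections = {2, 1}, as in the DOC example
  for (int64_t s : split_sizes(3, 0, {2, 1}))
    std::printf("section: %lld\n", static_cast<long long>(s));
  // axis dimension 6 split equally into num = 3 parts of 2
  for (int64_t s : split_sizes(6, 3, {}))
    std::printf("equal: %lld\n", static_cast<long long>(s));
  return 0;
}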
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +template +class SplitOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto in_stride = framework::stride(in->dims()); + int64_t axis = static_cast(ctx.Attr("axis")); + const size_t n = outs.size(); + size_t input_offset = 0; + for (size_t i = 0; i < n; i++) { + auto& out = outs[i]; + out->mutable_data(ctx.GetPlace()); + size_t axis_dim = out->dims()[axis]; + auto out_stride = framework::stride(out->dims()); + StridedMemcpy(ctx.device_context(), in->data() + input_offset, + in_stride, out->dims(), out_stride, out->data()); + input_offset += axis_dim * in_stride[axis]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..113ce2ce109778a355130aaf686261c1f71c0980 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input SelectedRows."); + AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable(); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + + AddComment(R"DOC( +Split a SelectedRows with a specified rows section. +height_sections is only needed when need to split the dims of the original tensor. 
+ +Example: + Input: + X.rows = {7, 5} + X.height = 12 + Attr: + height_sections = {4, 8} + Out: + out0.rows = {} + out0.height = 4 + + out1.rows = {5, 7} + out1.height = 8 + +)DOC"); + } +}; + +class SplitSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must have input X."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "SplitSelectedRowsOp must have output Out."); + + std::vector height_sections = + ctx->Attrs().Get>("height_sections"); + int64_t n = ctx->Outputs("Out").size(); + + std::vector outs_dims; + outs_dims.reserve(n); + + // make output dims + for (int64_t i = 0; i < n; ++i) { + auto dims = ctx->GetInputDim("X"); + if (height_sections.size()) { + PADDLE_ENFORCE_EQ( + height_sections.size(), static_cast(n), + "The size of height_sections should be the same as the" + " number of outputs."); + dims[0] = height_sections[i]; + } + outs_dims.push_back(dims); + } + ctx->SetOutputsDim("Out", outs_dims); + } +}; + +class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("sum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp, + ops::SplitSelectedRowsOpMaker, + ops::SplitSelectedRowsGradMaker); +REGISTER_OP_CPU_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/split_selected_rows_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0bbf1ecfaefeddd426b1055d93ce39a138abec28 --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_selected_rows_op.h" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + split_selected_rows, + ops::SplitSelectedRowsOpKernel); diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h new file mode 100644 index 0000000000000000000000000000000000000000..527264bd675520a98b442380e2d1ec259964e92e --- /dev/null +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +static int FindOutIdx(int row, const std::vector& height_sections) { + int offset = 0; + for (size_t i = 0; i < height_sections.size(); ++i) { + if (row >= offset && row < (offset + height_sections[i])) { + return i; + } + offset += height_sections[i]; + } + return -1; +} + +template +class SplitSelectedRowsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + auto height_sections = ctx.Attr>("height_sections"); + + auto x_rows = x->rows(); + std::vector> outs_rows_idx; + outs_rows_idx.resize(outs.size()); + + auto row_numel = x->value().numel() / x->value().dims()[0]; + auto src = x->value().data(); + + for (size_t i = 0; i < x_rows.size(); ++i) { + int out_idx = FindOutIdx(x_rows[i], height_sections); + outs_rows_idx[out_idx].push_back(i); + } + auto place = ctx.GetPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + if (rows_idx.size() > 0) { + auto dims = x->GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_value()->mutable_data(dims, x->place()); + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(x_rows[idx]); + } + auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy(platform::CPUPlace(), dst + j * row_numel, + platform::CPUPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel); + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), src + rows_idx[j] * row_numel, + sizeof(T) * row_numel, stream); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e6755b12000463b111fe65dbdf2c140a060d968b --- /dev/null +++ b/paddle/fluid/operators/spp_op.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
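For illustration, a minimal standalone sketch (hypothetical names, no Paddle dependencies) of the row-to-output mapping performed by FindOutIdx above, using the same numbers as the DOC example (X.rows = {7, 5}, height_sections = {4, 8}):

#include <iostream>
#include <vector>

// Same logic as FindOutIdx above: walk the height sections and return the
// index of the section that contains `row`, or -1 if none does.
int SectionOf(int row, const std::vector<int>& height_sections) {
  int offset = 0;
  for (size_t i = 0; i < height_sections.size(); ++i) {
    if (row >= offset && row < offset + height_sections[i]) return static_cast<int>(i);
    offset += height_sections[i];
  }
  return -1;
}

int main() {
  std::vector<int> sections = {4, 8};
  for (int row : {7, 5}) {
    // Rows 7 and 5 both fall into the second section [4, 12), i.e. output 1,
    // which is why out0.rows is empty and both rows end up in out1 in the example.
    std::cout << "row " << row << " -> out" << SectionOf(row, sections) << "\n";
  }
  return 0;
}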
*/ + +#include "paddle/fluid/operators/spp_op.h" +namespace paddle { +namespace operators { + +class SppOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SppOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of spp operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of spp operator." + "N * M." + "M = C * H * W"); + AddAttr("pyramid_height", "(int), multi level pooling"); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + "With spatial pyramid pooling, the input image can + be of any sizes. This not only allows arbitrary aspect + ratios, but also allows arbitrary scales. We can resize + the input image to any scale (e.g., min(w, h)=180, 224, + ...) and apply the same deep network. When the + input image is at different scales, the network (with + the same filter sizes) will extract features at different + scales. The scales play important roles in traditional + methods. + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(H_{out}, W_{out})$ + Where + $$ + H_{out} = N \\ + W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} + $$ + paper https://arxiv.org/pdf/1406.4729v4.pdf + )DOC"); + } +}; + +class SppOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SppOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SppOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int pyramid_height = ctx->Attrs().Get("pyramid_height"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Spping intput must be of 4-dimensional."); + int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; + std::vector output_shape({in_x_dims[0], outlen}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class SppOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad); +REGISTER_OP_CPU_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CPU_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..cad2ca5ef8e16ee6ea6943f61da33367099e0937 --- /dev/null +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/spp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + spp, ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL( + spp_grad, ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1da1f805807fc648b3a54de91842f163b356435b --- /dev/null +++ b/paddle/fluid/operators/spp_op.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +template +class SppKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + auto* out = context.Output("Out"); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + out->mutable_data(context.GetPlace()); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t output_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // pooling output shape + framework::Tensor out_level; + std::vector output_shape_vec( + {in_x->dims()[0], in_x->dims()[1], bins, bins}); + framework::DDim output_shape(framework::make_ddim(output_shape_vec)); + out_level.mutable_data(output_shape, context.GetPlace()); + // pooling + if (pooling_type == "max") { + math::Pool2dFunctor, T> pool_forward; + math::MaxPool max_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, max_process, &out_level); + } else if (pooling_type == "avg") { + math::Pool2dFunctor, T> pool_forward; + math::AvgPool avg_process; + pool_forward(context.template device_context(), *in_x, + kernel_size, strides, paddings, avg_process, &out_level); + } + // 
flatten pooling output shape + int output_flatten_w = in_x->dims()[1] * bins * bins; + std::vector output_flatten_shape_vec( + {in_x->dims()[0], output_flatten_w}); + framework::DDim output_flatten_shape( + framework::make_ddim(output_flatten_shape_vec)); + out_level.Resize(output_flatten_shape); + // concat + auto out_level_stride = framework::stride(out_level.dims()); + StridedMemcpy(context.template device_context(), + out_level.data(), out_level_stride, out_level.dims(), + out_stride, out->data() + output_offset); + output_offset += out_level.dims()[1] * out_level_stride[1]; + } + } +}; +template +class SppGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + int pyramid_height = context.template Attr("pyramid_height"); + std::string pooling_type = + context.template Attr("pooling_type"); + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + auto out_stride = framework::stride(out->dims()); + int input_h = in_x->dims()[2]; + int input_w = in_x->dims()[3]; + size_t out_offset = 0; + for (int p = 0; p < pyramid_height; ++p) { + int bins = std::pow(2, p); + int kernel_size_h = std::ceil(input_h / static_cast(bins)); + int kernel_size_w = std::ceil(input_w / static_cast(bins)); + int padding_h = (kernel_size_h * bins - input_h + 1) / 2; + int padding_w = (kernel_size_w * bins - input_w + 1) / 2; + std::vector kernel_size({kernel_size_h, kernel_size_w}); + std::vector strides({kernel_size_h, kernel_size_w}); + std::vector paddings({padding_h, padding_w}); + // split out and outgrad ... 
to flatten + framework::Tensor out_level; + framework::Tensor outgrad_level; + int out_flatten_w = in_x->dims()[1] * bins * bins; + std::vector out_flatten_shape_vec( + {in_x->dims()[0], out_flatten_w}); + framework::DDim out_flatten_shape( + framework::make_ddim(out_flatten_shape_vec)); + out_level.mutable_data(out_flatten_shape, context.GetPlace()); + outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); + auto flatten_stride = framework::stride(out_level.dims()); + // memcpy + StridedMemcpy(context.template device_context(), + out->data() + out_offset, out_stride, + out_level.dims(), flatten_stride, out_level.data()); + + StridedMemcpy(context.template device_context(), + out_grad->data() + out_offset, out_stride, + outgrad_level.dims(), flatten_stride, + outgrad_level.data()); + out_offset += out_level.dims()[1] * out_stride[1]; + // flatten backward to nchw + + std::vector out_shape_vec({in_x->dims()[0], in_x->dims()[1]}); + out_shape_vec.push_back( + (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); + out_shape_vec.push_back( + (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_level.ShareDataWith(out_level); + out_level.Resize(out_shape); + outgrad_level.ShareDataWith(outgrad_level); + outgrad_level.Resize(out_shape); + // pooling backward + if (pooling_type == "max") { + math::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, in_x_grad); + } else if (pooling_type == "avg") { + math::Pool2dGradFunctor, T> + pool_backward; + math::AvgPoolGrad avg_process; + pool_backward(context.template device_context(), *in_x, + *&out_level, *&outgrad_level, kernel_size, strides, + paddings, avg_process, in_x_grad); + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1d0c2c7f392cae3dc30611a0f077c1af7b68cbe --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
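For illustration, a standalone sketch (hypothetical, independent of the Paddle pooling functors) of the per-level geometry computed inside SppKernel and SppGradKernel above: each level p uses bins = 2^p along each spatial dimension, a kernel of ceil(H/bins) x ceil(W/bins), and just enough padding so that bins strides of that kernel cover the input:

#include <cmath>
#include <iostream>

int main() {
  const int input_h = 7, input_w = 9, pyramid_height = 3;
  for (int p = 0; p < pyramid_height; ++p) {
    int bins = 1 << p;  // 2^p bins per spatial dimension at this level
    int kernel_h = static_cast<int>(std::ceil(input_h / static_cast<double>(bins)));
    int kernel_w = static_cast<int>(std::ceil(input_w / static_cast<double>(bins)));
    int padding_h = (kernel_h * bins - input_h + 1) / 2;  // same formula as the kernels above
    int padding_w = (kernel_w * bins - input_w + 1) / 2;
    std::cout << "level " << p << ": bins=" << bins
              << " kernel=" << kernel_h << "x" << kernel_w
              << " padding=" << padding_h << "x" << padding_w << "\n";
  }
  return 0;
}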
*/ + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace paddle { +namespace operators { + +class SquaredL2DistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("sub_result"), + "Output(sub_result) of SquaredL2DistanceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SquaredL2DistanceOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims), + "Tensor rank of both SquaredL2DistanceOp's " + "inputs must be same."); + + int rank = framework::arity(x_dims); + PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); + PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0], + "Product of dimensions expcet the first dimension of " + "input and target must be equal."); + PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], + "First dimension of target must be equal to input " + "or to 1."); + + ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); + AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); + AddOutput("sub_result", + "(Tensor) Buffering subtraction result which " + "will be reused in backward.") + .AsIntermediate(); + AddOutput("Out", "(Tensor) Squared l2 distance between input and target."); + AddComment(R"DOC( +SquaredL2Distance operator + +This operator will cacluate the squared L2 distance for the input and +the target. Number of distance value will be equal to the first dimension +of input. First dimension of the target could be equal to the input or to 1. +If the first dimension of target is 1, the operator will broadcast target's +first dimension to input's first dimension. During backward propagation, +the user can decide whether to calculate the gradient of the input or +the target or both. + +Both the input X and Y can carry the LoD (Level of Details) information. +However, the output only shares the LoD information with input X. 
+ )DOC"); + } +}; + +class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of Out should not be null"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], + "First dimension of output gradient and " + "input value must be equal."); + PADDLE_ENFORCE_EQ(out_dims[1], 1, + "Second dimension of output gradient " + "must be 1."); + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); + if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, + ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad, + ops::SquaredL2DistanceGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..959e7afac99bd2565890cb6a296bc908f250a16c --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/squared_l2_distance_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_distance, + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h new file mode 100644 index 0000000000000000000000000000000000000000..aab241247e5e92f43a997f3d29c8e7d7d44d7711 --- /dev/null +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SquaredL2DistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("sub_result"); + auto* out1 = context.Output("Out"); + + auto in0_dims = in0->dims(); + auto in1_dims = in1->dims(); + + int cols = in0->numel() / in0_dims[0]; + // reduce dimensions except the first + auto x = + EigenMatrix::From(*in0, framework::make_ddim({in0_dims[0], cols})); + auto y = + EigenMatrix::From(*in1, framework::make_ddim({in1_dims[0], cols})); + + out0->mutable_data(context.GetPlace()); + out1->mutable_data(context.GetPlace()); + auto sub_result = EigenMatrix::From(*out0); + auto z = EigenVector::Flatten(*out1); + + auto& place = + *context.template device_context().eigen_device(); + auto x_dims = x.dimensions(); + auto y_dims = y.dimensions(); + // buffer the substraction result + if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { + sub_result.device(place) = + x - + y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); + } else { + sub_result.device(place) = x - y; + } + auto sub_res_pow2 = sub_result * sub_result; + z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); + } +}; + +template +class SquaredL2DistanceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("sub_result"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* x_g = context.Output(framework::GradVarName("X")); + auto* y_g = context.Output(framework::GradVarName("Y")); + + auto sub_result = EigenMatrix::From(*in0); + auto out_grad = EigenMatrix::From(*in1); + + auto x_dims = x_g->dims(); + auto y_dims = y_g->dims(); + + int cols = x_g->numel() / x_dims[0]; + // calculate gradient + auto grad_mat = 2 * + (out_grad.broadcast(Eigen::array({{1, cols}}))) * + sub_result; + + // propagate back to input + auto& eigen_place = + *context.template device_context().eigen_device(); + if (x_g) { + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are same with subResult + x_grad.device(eigen_place) = grad_mat; + } + + if (y_g) { + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater or " + "equal than first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a43cc22994b1b15a4acf0fd89b956bf05d3f35c8 --- 
/dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SquaredL2NormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + ctx->SetOutputDim("Out", {1}); + } +}; + +class SquaredL2NormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of squared_l2_norm op."); + AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); + AddComment(R"DOC( +SquaredL2Norm Operator. + +Computes the squared L2 norm of a tensor. + +$$Out = \sum_{i} X_{i}^2$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, + squared_l2_norm_grad, ops::SquaredL2NormGradOp); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CPU_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..52f4ab79b2189b269b7d0685dccedc52b627ad8d --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/squared_l2_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm, + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.h b/paddle/fluid/operators/squared_l2_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..56524636b8f1b063266fda0997e91a703131adff --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// Out = sum(square(X)) +template +class SquaredL2NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + framework::Tensor *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto out = framework::EigenScalar::From(*Out); + auto *place = + context.template device_context().eigen_device(); + + out.device(*place) = x.square().sum(); + } +}; + +// dX = X +template +class SquaredL2NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const framework::Tensor *X = context.Input("X"); + const framework::Tensor *dOut = + context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(dOut->numel() == 1, + "Squared L2 Norm Gradient should be scalar"); + framework::Tensor *dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(*X); + auto dout = framework::EigenVector::Flatten(*dOut); + auto dx = framework::EigenVector::Flatten(*dX); + auto *place = + context.template device_context().eigen_device(); + + Eigen::DSizes x_dsize(X->numel()); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h new file mode 100644 index 0000000000000000000000000000000000000000..8a99b405e266da48427fa23e9a3e67f2bc54c5a0 --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
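For illustration, a standalone sketch (hypothetical, Eigen-free) of the two squared_l2_norm kernels above: the forward pass computes Out = sum_i X_i^2, and since dOut is a scalar, the backward pass is dX_i = 2 * X_i * dOut:

#include <iostream>
#include <vector>

double SquaredL2Norm(const std::vector<double>& x) {
  double out = 0.0;
  for (double v : x) out += v * v;  // Out = sum(square(X))
  return out;
}

std::vector<double> SquaredL2NormGrad(const std::vector<double>& x, double dout) {
  std::vector<double> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) dx[i] = 2.0 * x[i] * dout;  // dX = 2 * X * dOut
  return dx;
}

int main() {
  std::vector<double> x = {1.0, -2.0, 3.0};
  std::cout << SquaredL2Norm(x) << "\n";        // 14
  for (double g : SquaredL2NormGrad(x, 1.0)) {  // 2 -4 6
    std::cout << g << " ";
  }
  std::cout << "\n";
  return 0;
}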
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/detail/strided_memcpy.h" + +namespace paddle { +namespace operators { + +// Strided memory copy from src to dst. +// +// The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will +// be a segment fault. +// +// The stride of an array (also referred to as increment, pitch or step size) is +// the number of locations in memory between beginnings of successive array +// elements +// +// For example, for tensor like [1, 3, 300, 300]. If there is no padding, the +// stride is [270000, 90000, 300, 1]. +// +// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke +// `dev_ctx.Wait()`. +template +inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_dim, + const framework::DDim& dst_stride, T* dst) { + using namespace detail; + StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + boost::apply_visitor(func, dst_dim); +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a369941a993c02744cb7de0d6c6c878f56d5e0fe --- /dev/null +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
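For illustration, a small standalone check (hypothetical helper, not Paddle's framework::stride) of the stride convention described in the comment above: for a contiguous [1, 3, 300, 300] tensor the row-major strides are [270000, 90000, 300, 1]:

#include <iostream>
#include <vector>

// Row-major strides: stride[i] = product of dims[i+1 ..].
std::vector<long long> RowMajorStrides(const std::vector<long long>& dims) {
  std::vector<long long> stride(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    stride[i] = stride[i + 1] * dims[i + 1];
  }
  return stride;
}

int main() {
  for (long long s : RowMajorStrides({1, 3, 300, 300})) {
    std::cout << s << " ";  // 270000 90000 300 1
  }
  std::cout << "\n";
  return 0;
}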
*/ + +#include "paddle/fluid/operators/strided_memcpy.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +TEST(StridedMemcpy, CPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + framework::DDim src_stride({5, 1}); + + int dst[4]; + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + platform::CPUDeviceContext ctx; + StridedMemcpy(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); +} + +TEST(StridedMemcpy, CPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + int dst[8]; + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + platform::CPUDeviceContext ctx; + + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst); + StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } +} + +#ifdef PADDLE_WITH_CUDA +TEST(StridedMemcpy, GPUCrop) { + // clang-format off + int src[] = { + 0, 1, 2, 0, 0, + 0, 3, 4, 0, 0, + 0, 0, 0, 0, 0, + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + framework::DDim src_stride({5, 1}); + + int dst[4]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({2, 1}); + + StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, + gpu_dst); + + memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + ASSERT_EQ(1, dst[0]); + ASSERT_EQ(2, dst[1]); + ASSERT_EQ(3, dst[2]); + ASSERT_EQ(4, dst[3]); + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +TEST(StridedMemcpy, GPUConcat) { + // clang-format off + int src[] = { + 1, 2, + 3, 4 + }; + // clang-format on + + platform::CUDAPlace gpu0(0); + platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); + + int dst[8]; + int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + + framework::DDim src_stride({2, 1}); + framework::DDim dst_dim({2, 2}); + framework::DDim dst_stride({4, 1}); + + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); + StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, + gpu_dst + 2); + + memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream()); + ctx.Wait(); + + // clang-format off + int expect_dst[] = { + 1, 2, 1, 2, + 3, 4, 3, 4 + }; + // clang-format on + for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { + ASSERT_EQ(expect_dst[i], dst[i]); + } + + memory::Free(gpu0, gpu_dst); + memory::Free(gpu0, gpu_src); +} + +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..96f851720aea2ddd643e1e6251fee314e26cbf95 --- /dev/null +++ 
b/paddle/fluid/operators/sum_op.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +class SumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null"); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SumOp should not be null."); + if (ctx->IsRuntime() && + ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + return; // skip runtime infershape when is tensor array; + } + + auto x_dims = ctx->GetInputsDim("X"); + size_t N = x_dims.size(); + PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + + framework::DDim in_dim({0}); + for (auto& x_dim : x_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); + } + } + ctx->SetOutputDim("Out", in_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto x_vars = ctx.MultiInputVar("X"); + if (x_vars[0]->IsType()) { + int dtype = -1; + for (auto& x_var : x_vars) { + auto& lod_tensor = x_var->Get(); + if (lod_tensor.numel() == 0) { + continue; + } + if (dtype == -1) { + dtype = framework::ToDataType(lod_tensor.type()); + } else { + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + } + } + PADDLE_ENFORCE_NE(dtype, -1, + "Sum operator should have at least one tensor"); + + return framework::OpKernelType( + static_cast(dtype), ctx.device_context()); + } else if (x_vars[0]->IsType()) { + return framework::OpKernelType( + framework::ToDataType( + x_vars[0]->Get().value().type()), + ctx.device_context()); + } else if (x_vars[0]->IsType()) { + for (auto& x_var : x_vars) { + auto& array = x_var->Get(); + for (auto& each : array) { + if (each.numel() != 0) { + return framework::OpKernelType(framework::ToDataType(each.type()), + ctx.device_context()); + } + } + } + PADDLE_THROW("Cannot find the input data type by all input data"); + } + PADDLE_THROW("Unexpected branch. Input type is %s", + x_vars[0]->Type().name()); + } +}; + +class SumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SumOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(vector) The input tensors of sum operator.") + .AsDuplicable(); + AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddComment(R"DOC( +Sum operator. + +This operators sums the input tensors. 
All the inputs can carry the +LoD (Level of Details) information. However, the output only shares +the LoD information with the first input. +)DOC"); + } +}; + +class SumOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto var_type = framework::proto::VarDesc::SELECTED_ROWS; + + for (auto& name : op_desc.Input("X")) { + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name).GetType(); + } + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR; + }); + + auto is_tensor_array = [block](const std::string& name) { + return block->FindRecursiveOrCreateVar(name).GetType() == + framework::proto::VarDesc::LOD_TENSOR_ARRAY; + }; + + bool any_input_is_tensor_array = + std::any_of(inputs.begin(), inputs.end(), is_tensor_array); + bool all_inputs_are_tensor_array = + std::all_of(inputs.begin(), inputs.end(), is_tensor_array); + + if (any_input_is_tensor_array) { + if (!all_inputs_are_tensor_array) { + std::ostringstream os; + for (auto& each : inputs) { + os << " " << each << " type is " + << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; + } + PADDLE_ENFORCE(all_inputs_are_tensor_array, + "Not all inputs are tensor array:\n%s", os.str()); + } + var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY; + } else if (any_input_is_lod_tensor) { + var_type = framework::proto::VarDesc::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + out_var.SetType(var_type); + auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +class SumGradMaker : public framework::GradOpDescMakerBase { + public: + using framework::GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); + std::vector> grad_ops; + grad_ops.reserve(x_grads.size()); + auto og = OutputGrad("Out"); + std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), + [&og](const std::string& x_grad) { + auto* grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", og); + grad_op->SetOutput("Out", {x_grad}); + grad_op->SetAttr("scale", 1.0f); + return std::unique_ptr(grad_op); + }); + return grad_ops; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, + ops::SumOpVarTypeInference); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..8d8f90d7510bae854a0507adaf8998fb7aea3b58 --- /dev/null +++ b/paddle/fluid/operators/sum_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/sum_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5e1222c6ef723a6321392a5af7fdb558c24df32b --- /dev/null +++ b/paddle/fluid/operators/sum_op.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; + +template +class SumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + int N = in_vars.size(); + auto out_var = context.OutputVar("Out"); + + bool in_place = out_var == in_vars[0]; + + if (out_var->IsType()) { + auto *out = context.Output("Out"); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto result = EigenVector::Flatten(*out); + if (!in_place) { + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); + } + + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); + // If in_place, just skip the first tensor + for (int i = in_place ? 
1 : 0; i < N; i++) { + if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + auto in = EigenVector::Flatten(in_t); + result.device(place) = result + in; + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); + functor(context.template device_context(), in_t, out); + } else { + PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + } + } + } else if (out_var->IsType()) { + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto &in_sel0 = in_vars[0]->Get(); + auto &rows = in_sel0.rows(); +#ifdef PADDLE_WITH_CUDA + std::vector rows_in_cpu; + rows_in_cpu.reserve(rows.size()); + for (auto item : rows) { + rows_in_cpu.push_back(item); + } + in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height())); +#else + in0.reset(new framework::SelectedRows(rows, in_sel0.height())); +#endif + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows & { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + + auto *out = context.Output("Out"); + out->mutable_rows()->clear(); + auto *out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + out_value->mutable_data(context.GetPlace()); + + math::SelectedRowsAddTo functor; + + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(context.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); + } + } else if (out_var->IsType()) { + auto &out_array = *out_var->GetMutable(); + for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto &in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::Copy(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + auto in = EigenVector::Flatten(in_array[i]); + auto result = EigenVector::Flatten(out_array[i]); + result.device(*context.template device_context() + .eigen_device()) = result + in; + } + } + } + } + } else { + PADDLE_THROW("Unexpected branch, output variable type is %s", + out_var->Type().name()); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..24f1b7252318f8321ef394d321aedb658398e16d --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +class TargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // checkout inputs + PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"), + "Input(EncodedGTBBox) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"), + "Input(GTScoreLabel) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("NegIndices"), + "Input(NegIndices) of TargetAssignOp should not be null"); + + // checkout outputs + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxLabel"), + "Output(PredBBoxLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredBBoxWeight"), + "Output(PredBBoxWeight) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreLabel"), + "Output(PredScoreLabel) of TargetAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("PredScoreWeight"), + "Output(PredScoreWeight) of TargetAssignOp should not be null."); + + auto blabel_dims = ctx->GetInputDim("EncodedGTBBox"); + auto slabel_dims = ctx->GetInputDim("GTScoreLabel"); + auto mi_dims = ctx->GetInputDim("MatchIndices"); + auto neg_dims = ctx->GetInputDim("NegIndices"); + + PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL, + "The rank of Input(EncodedGTBBox) must be 3."); + PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL, + "The rank of Input(GTScoreLabel) must be 2."); + PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL, + "The rank of Input(MatchIndices) must be 2."); + PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL, + "The rank of Input(NegIndices) must be 2."); + + PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0], + "The 1st dimension (means the total number of " + "ground-truth bounding boxes) of Input(EncodedGTBBox) " + "and Input(GTScoreLabel) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1], + "The 2nd dimension (means the number of priod boxes) " + "of Input(EncodedGTBBox) and " + "Input(MatchIndices) must be the same."); + PADDLE_ENFORCE_EQ(blabel_dims[2], 4, + "The 3rd dimension of Input(EncodedGTBBox) must be 4."); + + auto n = mi_dims[0]; + auto np = mi_dims[1]; + ctx->SetOutputDim("PredBBoxLabel", {n, np, 4}); + ctx->SetOutputDim("PredBBoxWeight", {n, np, 1}); + ctx->SetOutputDim("PredScoreLabel", {n, np, 1}); + ctx->SetOutputDim("PredScoreWeight", {n, np, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("EncodedGTBBox")->type()), + ctx.device_context()); + } +}; + +class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("EncodedGTBBox", + "(LoDTensor), The encoded ground-truth bounding boxes with 
shape " + "[Ng, Np, 4], where Ng is the total number of ground-truth boxes " + "in this mini-batch, Np the number of predictions, 4 is the " + "number of coordinate in [xmin, ymin, xmax, ymax] layout."); + AddInput("GTScoreLabel", + "(LoDTensor, default LoDTensor), The input ground-truth " + "labels with shape [Ng, 1], where the Ng is the same as it in " + "the input of EncodedGTBBox."); + AddInput("MatchIndices", + "(Tensor, default Tensor), The input matched indices " + "with shape [N, Np], where N is the batch size, Np is the same " + "as it in the input of EncodedGTBBox. If MatchIndices[i][j] " + "is -1, the j-th prior box is not matched to any ground-truh " + "box in i-th instance."); + AddInput("NegIndices", + "(LoDTensor, default LoDTensor), The input negative example " + "indices with shape [Neg, 1], where is the total number of " + "negative example indices."); + AddAttr("background_label", + "(int, default 0), Label index of background class.") + .SetDefault(0); + AddOutput("PredBBoxLabel", + "(Tensor), The output encoded ground-truth labels " + "with shape [N, Np, 4], N is the batch size and Np, 4 is the " + "same as they in input of EncodedGTBBox. If MatchIndices[i][j] " + "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth " + "box for background_label in i-th instance."); + AddOutput("PredBBoxWeight", + "(Tensor), The weight for PredBBoxLabel with the shape " + "of [N, Np, 1]"); + AddOutput("PredScoreLabel", + "(Tensor, default Tensor), The output score labels for " + "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] " + "is -1, PredScoreLabel[i][j] = background_label."); + AddOutput("PredScoreWeight", + "(Tensor), The weight for PredScoreLabel with the shape " + "of [N, Np, 1]"); + AddComment(R"DOC( +This operator is, for given the encoded boxes between prior boxes and +ground-truth boxes and ground-truth class labels, to assign classification +and regression targets to each prior box as well as weights to each +prior box. The weights is used to specify which prior box would not contribute +to training loss. + +For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`, +`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`. +Assumed that the row offset for each instance in `EncodedGTBBox` is called lod, +this operato assigns classification/regression targets by performing the +following steps: + +1. Assigning all outpts based on `MatchIndices`: + +If id = MatchIndices[i][j] > 0, + + PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j] + PredBBoxWeight[i][j] = 1. + PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id] + PredScoreWeight[i][j] = 1. + +Otherwise, + + PredBBoxLabel[j][j] = [0., 0., 0., 0.] + PredBBoxWeight[i][j] = 0. + PredScoreLabel[i][j] = background_label + PredScoreWeight[i][j] = 0. + +2. 
Assigning PredScoreWeight based on `NegIndices`: + +Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod, +for i-th instance and all ids of NegIndices in this instance: + + PredScoreLabel[i][id] = background_label + PredScoreWeight[i][id] = 1.0 + + )DOC"); + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, T* out_label_wt) { + for (int i = 0; i < num; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + int id = neg_indices[j]; + out_label[i * num_prior_box + id] = background_label; + out_label_wt[i * num_prior_box + id] = static_cast(1.0); + } + } + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp, + ops::TargetAssignOpMaker); +REGISTER_OP_CPU_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c012d27ad82eb62d9981c8c73ef5b8cc03adc47 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.cu @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
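To make the assignment rule described in the DOC block above concrete, here is a small host-side sketch of the same two steps: a non-negative match index copies an encoded ground-truth box and its label with weight 1, an unmatched prior box keeps a zero box, zero weights and the background label, and negative indices then force the score weight back to 1. Shapes and values are made up for illustration; this is not the operator's actual kernel.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int N = 1;           // batch size
  const int Np = 3;          // number of prior boxes
  const int background = 0;  // background class label

  // Flattened [Ng, Np, 4] encoded ground-truth boxes; one ground-truth box.
  std::vector<float> gt_box = {1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2};
  std::vector<int> gt_label = {7};        // [Ng, 1]
  std::vector<int> match = {-1, 0, -1};   // [N, Np], -1 means unmatched
  std::vector<size_t> lod = {0, 1};       // row offsets into gt_box / gt_label
  std::vector<int> neg = {2};             // negative prior-box ids, instance 0

  std::vector<float> box_label(N * Np * 4, 0.f), box_wt(N * Np, 0.f);
  std::vector<int> score_label(N * Np, background);
  std::vector<float> score_wt(N * Np, 0.f);

  // Step 1: assign all outputs from MatchIndices.
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < Np; ++j) {
      int off = i * Np + j;
      int id = match[off];
      if (id > -1) {
        size_t row = lod[i] + id;
        for (int c = 0; c < 4; ++c)
          box_label[off * 4 + c] = gt_box[(row * Np + j) * 4 + c];
        box_wt[off] = 1.f;
        score_label[off] = gt_label[row];
        score_wt[off] = 1.f;
      }
      // Unmatched boxes keep the zero box, zero weights and background label.
    }
  }

  // Step 2: negative examples get the background label with weight 1.
  for (int i = 0; i < N; ++i) {
    for (int id : neg) {
      score_label[i * Np + id] = background;
      score_wt[i * Np + id] = 1.f;
    }
  }

  for (int j = 0; j < Np; ++j)
    std::cout << "prior " << j << ": label=" << score_label[j]
              << " score_wt=" << score_wt[j] << "\n";
  return 0;
}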
*/ + +#include "paddle/fluid/operators/target_assign_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod, + const int num, const int num_prior_box, + const int background_label, + int* out_label, T* out_label_wt) { + int bidx = blockIdx.x; + int st = lod[bidx]; + int ed = lod[bidx + 1]; + + int row_start = bidx * num_prior_box; + for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { + int id = row_start + neg_indices[i]; + out_label[id] = background_label; + out_label_wt[id] = 1.; + } +} + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const int* neg_indices, const size_t* lod, const int num, + const int num_prior_box, const int background_label, + int* out_label, T* out_label_wt) { + const int block_size = 256; + const int grid_size = num; + NegTargetAssignKernel<<>>( + neg_indices, lod, num, num_prior_box, background_label, out_label, + out_label_wt); + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/target_assign_op.h new file mode 100644 index 0000000000000000000000000000000000000000..876111523af51ce67e804d6646f404c45c00af12 --- /dev/null +++ b/paddle/fluid/operators/target_assign_op.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
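The element-wise functor defined in target_assign_op.h below runs once per (instance, prior box) pair over a flattened range of size N * Np, recovering the row and column from the flat index. A tiny sketch of that index arithmetic, with illustrative sizes:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t num_prior_box = 4;  // Np
  const int64_t num = 3;            // N (batch size)

  for (int64_t i = 0; i < num * num_prior_box; ++i) {
    // Same decomposition as the per-element operator(): row picks the
    // instance, col picks the prior box within that instance.
    int64_t row = i / num_prior_box;
    int64_t col = i - row * num_prior_box;
    int64_t offset = row * num_prior_box + col;  // == i, the flat output slot
    std::cout << "i=" << i << " -> row=" << row << ", col=" << col
              << ", offset=" << offset << "\n";
  }
  return 0;
}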
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct TargetAssignFunctor { + const T* gt_box_; + const int* gt_label_; + const int* match_indices_; + const size_t* lod_; + const int background_label_; + const int64_t num_; + const int64_t num_prior_box_; + + T* out_box_; + T* out_box_wt_; + int* out_label_; + T* out_label_wt_; + + TargetAssignFunctor(const T* gt_box, const int* gt_label, + const int* match_indices, const size_t* lod, + const int background_label, const int64_t num, + const int64_t np, T* out_box, T* out_box_wt, + int* out_label, T* out_label_wt) + : gt_box_(gt_box), + gt_label_(gt_label), + match_indices_(match_indices), + lod_(lod), + background_label_(background_label), + num_(num), + num_prior_box_(np), + out_box_(out_box), + out_box_wt_(out_box_wt), + out_label_(out_label), + out_label_wt_(out_label_wt) {} + + HOSTDEVICE void operator()(size_t i) const { + int row = i / num_prior_box_; + int col = i - row * num_prior_box_; + + size_t row_off = lod_[row]; + int offset = row * num_prior_box_ + col; + + int id = match_indices_[offset]; + T* obox = out_box_ + offset * 4; + int* olabel = out_label_ + offset; + T* obox_wt = out_box_wt_ + offset; + T* olabel_wt = out_label_wt_ + offset; + + if (id > -1) { + const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4; + + obox[0] = gtbox[0]; + obox[1] = gtbox[1]; + obox[2] = gtbox[2]; + obox[3] = gtbox[3]; + + olabel[0] = gt_label_[row_off + id]; + obox_wt[0] = static_cast(1.); + olabel_wt[0] = static_cast(1.); + } else { + obox[0] = static_cast(0.); + obox[1] = static_cast(0.); + obox[2] = static_cast(0.); + obox[3] = static_cast(0.); + + olabel[0] = background_label_; + obox_wt[0] = static_cast(0.); + olabel_wt[0] = static_cast(0.); + } + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::DeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int num, const int num_prior_box, + const int background_label, int* out_label, + T* out_label_wt) const; +}; + +template +class TargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* enc_gt_box = ctx.Input("EncodedGTBBox"); + auto* gt_label = ctx.Input("GTScoreLabel"); + auto* match_indices = ctx.Input("MatchIndices"); + auto* neg_indices = ctx.Input("NegIndices"); + + auto* out_box = ctx.Output("PredBBoxLabel"); + auto* out_box_wt = ctx.Output("PredBBoxWeight"); + auto* out_label = ctx.Output("PredScoreLabel"); + auto* out_label_wt = ctx.Output("PredScoreWeight"); + + PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL); + PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL); + + int background_label = ctx.Attr("background_label"); + + const T* box_data = enc_gt_box->data(); + const int* label_data = gt_label->data(); + const int* match_idx_data = match_indices->data(); + const int* neg_idx_data = neg_indices->data(); + + T* obox_data = out_box->mutable_data(ctx.GetPlace()); + T* obox_wt_data = out_box_wt->mutable_data(ctx.GetPlace()); + int* olabel_data = out_label->mutable_data(ctx.GetPlace()); + T* olabel_wt_data = out_label_wt->mutable_data(ctx.GetPlace()); + + int64_t num = match_indices->dims()[0]; + int64_t num_prior_box = match_indices->dims()[1]; + + auto gt_lod = enc_gt_box->lod().back(); + auto gt_label_lod = 
gt_label->lod().back(); + auto neg_lod = neg_indices->lod().back(); + for (size_t i = 0; i < gt_lod.size(); ++i) { + PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); + } + + size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace()); + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + + TargetAssignFunctor functor(box_data, label_data, match_idx_data, + gt_lod_data, background_label, num, + num_prior_box, obox_data, obox_wt_data, + olabel_data, olabel_wt_data); + + auto& device_ctx = ctx.template device_context(); + platform::ForRange for_range(device_ctx, + num * num_prior_box); + for_range(functor); + + NegTargetAssignFunctor neg_trg_functor; + neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box, + background_label, olabel_data, olabel_wt_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..50811fb22491598849216f41a584ae0b68f8f306 --- /dev/null +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +namespace paddle { +namespace operators { + +class WriteToArrayOp : public ArrayOp { + public: + WriteToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) return; + auto &x_tensor = x->Get(); + size_t offset = GetOffset(scope, place); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; + out->resize(offset + 1); + } + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + Copy(x_tensor, place, dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } + } +}; + +class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); + AddInput( + "I", + "(Tensor) the subscript index in tensor array. 
The number of element " + "should be 1"); + AddOutput("Out", "(TensorArray) the tensor array will be written"); + AddComment(R"DOC( +WriteToArray Operator. + +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The +equation is + +$$A[i] = T$$ + +)DOC"); + } +}; + +class WriteToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); + PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, + "The number of element of subscript index must be 1"); + if (!context->HasInput("X")) { + return; + } + PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); + context->SetOutputDim("Out", context->GetInputDim("X")); + } + + protected: + virtual const char *NotHasXError() const { return "Must set the lod tensor"; } + + virtual const char *NotHasOutError() const { + return "Must set the lod tensor array"; + } +}; + +class WriteToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto x_name = op_desc.Input("X")[0]; + auto out_name = op_desc.Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + auto &out = block->FindRecursiveOrCreateVar(out_name); + out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); + auto *x = block->FindVarRecursive(x_name); + if (x != nullptr) { + out.SetDataType(x->GetDataType()); + } + } +}; + +class ReadFromArrayOp : public ArrayOp { + public: + ReadFromArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x != nullptr, "X must be set"); + auto &x_array = x->Get(); + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out != nullptr, "Out must be set"); + size_t offset = GetOffset(scope, place); + if (offset < x_array.size()) { + auto *out_tensor = out->GetMutable(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Copy(x_array[offset], place, dev_ctx, out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } + } +}; + +class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(TensorArray) the array will be read from."); + AddInput("I", + "(Tensor) the subscript index in tensor array. The number of " + "element should be 1"); + AddOutput("Out", "(LoDTensor) the tensor will be read from."); + AddComment(R"DOC( +ReadFromArray Operator. + +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. 
The +equation is + +$$T = A[i]$$ + +)DOC"); + } +}; + +class ReadFromArrayInferShape : public WriteToArrayInferShape { + protected: + const char *NotHasXError() const override { + return "The input array X must be set"; + } + const char *NotHasOutError() const override { + return "The output tensor out must be set"; + } +}; + +class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("read_from_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("write_to_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp, + ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker, + ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType); +REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp, + ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker, + ops::ReadFromArrayGradMaker); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c81ea860d0c9fa5498de2d149e3d05d080ad729f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
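The two array ops above treat a LoDTensorArray essentially as a growable vector indexed by a one-element subscript tensor: write_to_array grows the array when the index is past the end and copies the tensor in, read_from_array copies the element back out and logs when the index is out of range. A minimal sketch of those semantics with std::vector standing in for the array; the helper names are illustrative, not the Paddle API.

#include <cstddef>
#include <iostream>
#include <vector>

using Tensor = std::vector<float>;        // stand-in for LoDTensor
using TensorArray = std::vector<Tensor>;  // stand-in for LoDTensorArray

// write_to_array: A[i] = T, growing A if i is past the current size.
void WriteToArray(TensorArray* arr, size_t i, const Tensor& t) {
  if (i >= arr->size()) arr->resize(i + 1);
  (*arr)[i] = t;  // the real op deep-copies and preserves the LoD
}

// read_from_array: T = A[i]; out-of-range reads are a no-op with a warning,
// mirroring the VLOG message in ReadFromArrayOp.
bool ReadFromArray(const TensorArray& arr, size_t i, Tensor* out) {
  if (i >= arr.size()) {
    std::cerr << "offset " << i << " >= " << arr.size() << "\n";
    return false;
  }
  *out = arr[i];
  return true;
}

int main() {
  TensorArray a;
  WriteToArray(&a, 2, {1.f, 2.f, 3.f});  // array grows to size 3
  Tensor t;
  if (ReadFromArray(a, 2, &t)) std::cout << "read " << t.size() << " values\n";
  ReadFromArray(a, 5, &t);               // prints the out-of-range warning
  return 0;
}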
*/ + +#include "paddle/fluid/operators/top_k_op.h" + +namespace paddle { +namespace operators { + +class TopkOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int k = static_cast(ctx->Attrs().Get("k")); + + PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); + PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); + PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, + "input must have >= k columns"); + + framework::DDim dims = input_dims; + dims[dims.size() - 1] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class TopkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. + +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .SetDefault(1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(top_k, + ops::TopkKernel); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5390cb5063bcc302f5ff9cfe96bd421b477eeb3f --- /dev/null +++ b/paddle/fluid/operators/top_k_op.cu @@ -0,0 +1,320 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
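The shape logic in TopkOp::InferShape keeps every input dimension except the last, which becomes k, and requires k >= 1 and the last input dimension to be at least k. A short sketch of that rule under made-up dimensions:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirror of the InferShape rule: Out and Indices share the input's shape
// with the last dimension replaced by k.
std::vector<int64_t> TopkOutputDims(std::vector<int64_t> in_dims, int k) {
  assert(k >= 1 && "k must >= 1");
  assert(!in_dims.empty() && "input must have >= 1d shape");
  assert(in_dims.back() >= k && "input must have >= k columns");
  in_dims.back() = k;
  return in_dims;
}

int main() {
  auto dims = TopkOutputDims({32, 100}, 5);  // e.g. a batch of 32 rows, top-5
  std::cout << "Out/Indices shape: [" << dims[0] << ", " << dims[1] << "]\n";
  return 0;
}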
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int64_t id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int64_t id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { + for (int k = beam_size - 2; k >= 0; k--) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + AddTo(topk, tmp, beam_size); + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, + int idx, int dim, const Pair& max, + int beam_size) { + while (idx < dim) { + if (topk[beam_size - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + AddTo(topk, tmp, beam_size); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* src, + bool& firstStep, bool& is_empty, + Pair& max, int dim, + const int tid) { + if (beam > 0) { + int length = beam < beam_size ? beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, src, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, src, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, + int beam_size, const T* val, + int* col, bool& firstStep, + bool& is_empty, Pair& max, + int dim, const int tid) { + if (beam > 0) { + int length = beam < beam_size ? 
beam : beam_size; + if (firstStep) { + firstStep = false; + GetTopK(topk, val, col, tid, dim, length); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - beam) { + topk[k] = topk[k + beam]; + } else { + topk[k].set(-INFINITY, -1); + } + } + if (!is_empty) { + GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + length); + } + } + + max = topk[MaxLength - 1]; + if (max.v == -1) is_empty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int64_t** topIds, int& beam, int& k, + const int tid, const int warp) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) beam++; + if (--k == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (beam < MaxLength) { + sh_topk[tid] = topk[beam]; + } + } + if (maxid[0] / 32 == warp) { + if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. + */ +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k) { + __shared__ Pair sh_topk[BlockSize]; + __shared__ int maxid[BlockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + output += blockIdx.x * output_stride; + indices += blockIdx.x * k; + + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int k = 0; k < MaxLength; k++) { + topk[k].set(-INFINITY, -1); + } + while (k) { + ThreadGetTopK(topk, beam, k, + src + blockIdx.x * lds, firststep, + is_empty, max, dim, tid); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &output, + &indices, beam, k, tid, warp); + } +} + +template +class TopkOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + size_t k = static_cast(ctx.Attr("k")); + + const T* input_data = input->data(); + + T* output_data = output->mutable_data(ctx.GetPlace()); + // FIXME(typhoonzero): data is always converted to type T? + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + size_t input_height = input->dims()[0]; + size_t input_width = input->dims()[1]; + if (k > input_width) k = input_width; + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + // TODO(typhoonzero): refine this kernel. 
+ dim3 threads(256, 1); + dim3 grid(input_height, 1); + + KeMatrixTopK<<< + grid, threads, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(output_data, output->dims()[1], + indices_data, input_data, + input_width, input_width, int(k)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e32b35150070b30c3ccbbb9483c1f24b1d205919 --- /dev/null +++ b/paddle/fluid/operators/top_k_op.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class TopkKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Get the top k elements of each row of input tensor + // FIXME: only deal with matrix(2d tensor). + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + // k is determined by Attr + const size_t k = static_cast(ctx.Attr("k")); + + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + auto eg_input = EigenMatrix::From(*input); + + // reshape input to a flattern matrix(like flat_inner_dims) + framework::DDim inputdims = input->dims(); + const size_t row = framework::product( + framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); + const size_t col = inputdims[inputdims.size() - 1]; + Eigen::DSizes flat2dims(row, col); + // NOTE: eigen shape doesn't affect paddle tensor. + eg_input.reshape(flat2dims); + + for (size_t i = 0; i < row; i++) { + std::vector> vec; + for (size_t j = 0; j < col; j++) { + vec.push_back(std::pair(eg_input(i, j), j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + k, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < k; j++) { + output_data[i * k + j] = vec[j].first; + indices_data[i * k + j] = int64_t(vec[j].second); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3d8acffc269c1abe98d2de39dcf09fbf3d825f3 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
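The CPU TopkKernel in top_k_op.h reduces each row independently: it pairs every value with its column index, partially sorts the first k pairs by value in descending order, and emits the values and indices. Here is the same per-row routine as a standalone sketch (a simplification of the kernel, not the kernel itself):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Top-k per row of a row-major [rows x cols] matrix, as in the CPU kernel:
// std::partial_sort on (value, column) pairs, largest values first.
void RowTopK(const std::vector<float>& data, size_t rows, size_t cols, size_t k,
             std::vector<float>* values, std::vector<int64_t>* indices) {
  values->resize(rows * k);
  indices->resize(rows * k);
  for (size_t i = 0; i < rows; ++i) {
    std::vector<std::pair<float, size_t>> vec;
    vec.reserve(cols);
    for (size_t j = 0; j < cols; ++j) vec.emplace_back(data[i * cols + j], j);
    std::partial_sort(vec.begin(), vec.begin() + k, vec.end(),
                      [](const std::pair<float, size_t>& l,
                         const std::pair<float, size_t>& r) {
                        return l.first > r.first;
                      });
    for (size_t j = 0; j < k; ++j) {
      (*values)[i * k + j] = vec[j].first;
      (*indices)[i * k + j] = static_cast<int64_t>(vec[j].second);
    }
  }
}

int main() {
  std::vector<float> x = {0.1f, 0.9f, 0.5f,   // row 0
                          0.7f, 0.2f, 0.8f};  // row 1
  std::vector<float> vals;
  std::vector<int64_t> idx;
  RowTopK(x, 2, 3, 2, &vals, &idx);
  for (size_t i = 0; i < 2; ++i)
    std::cout << "row " << i << ": top-2 = " << vals[i * 2] << "(" << idx[i * 2]
              << "), " << vals[i * 2 + 1] << "(" << idx[i * 2 + 1] << ")\n";
  return 0;
}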
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class TransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + std::vector axis = ctx->Attrs().Get>("axis"); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_ENFORCE_EQ(x_rank, axis_size, + "The input tensor's rank(%d) " + "should be equal to the axis's size(%d)", + x_rank, axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + "Each element of Attribute axis should be a unique value " + "range from 0 to (dims - 1), " + "where the dims is the axis's size"); + } + + framework::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + ctx->SetOutputDim("Out", out_dims); + } +}; + +class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor, tensors with rank up to 6 are supported."); + AddOutput("Out", "(Tensor)The output tensor."); + AddAttr>( + "axis", + "(vector) A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); + AddComment(R"DOC( +Transpose Operator. + +The input tensor will be permuted according to the axes given. +The behavior of this operator is similar to how `numpy.transpose` works. + +- suppose the input `X` is a 2-D tensor: + $$ + X = \begin{pmatrix} + 0 &1 &2 \\ + 3 &4 &5 + \end{pmatrix}$$ + + the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis) + + then the output $Y$ is: + + $$ + Y = \begin{pmatrix} + 0 &3 \\ + 1 &4 \\ + 2 &5 + \end{pmatrix}$$ + +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. 
+ +)DOC"); + } +}; + +class TransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, + ops::TransposeOpGrad); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8667ab369e1de9c2b74ba902242355d1660d24a --- /dev/null +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( + transpose_grad, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1fb419474ab078efa64523454bbbbb6176a58d40 --- /dev/null +++ b/paddle/fluid/operators/transpose_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + math::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + math::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + math::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + math::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + math::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + math::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + PADDLE_THROW("Tensors with rank at most 6 are supported"); + } +} + +template +class TransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + std::vector axis = context.Attr>("axis"); + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); + } +}; + +template +class TransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); + if (!x_grad) return; + + x_grad->mutable_data(context.GetPlace()); + std::vector axis = context.Attr>("axis"); + std::vector reversed_axis(axis); + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + int ndims = axis.size(); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6fea1d4485fd7f88375c96511c85396b707bf1c --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
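The transpose kernels just shown permute the dimensions according to `axis`, and the gradient kernel simply applies the inverse permutation (reversed_axis[axis[i]] = i). A small sketch that computes the permuted shape and the inverse axis list, matching the DOC example for axis = [0, 2, 3, 1]; the dimension values are illustrative.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {8, 3, 32, 64};  // (N, C, H, W)
  std::vector<int> axis = {0, 2, 3, 1};          // NCHW -> NHWC

  // Forward: out_dims[i] = x_dims[axis[i]], as in TransposeOp::InferShape.
  std::vector<int64_t> out_dims(x_dims.size());
  for (size_t i = 0; i < axis.size(); ++i) out_dims[i] = x_dims[axis[i]];

  // Backward: the gradient is transposed back with the inverse permutation,
  // computed the same way as in TransposeGradKernel.
  std::vector<int> reversed_axis(axis.size());
  for (size_t i = 0; i < axis.size(); ++i)
    reversed_axis[axis[i]] = static_cast<int>(i);

  std::cout << "out dims:";
  for (auto d : out_dims) std::cout << " " << d;       // 8 32 64 3
  std::cout << "\ninverse axis:";
  for (auto a : reversed_axis) std::cout << " " << a;  // 0 3 1 2
  std::cout << "\n";
  return 0;
}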
+template +class CPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* tensor = ctx.Output("Out"); + T* data = tensor->mutable_data(ctx.GetPlace()); + unsigned int seed = static_cast(ctx.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(ctx.Attr("min")), + static_cast(ctx.Attr("max"))); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class UniformRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UniformRandomOp should not be null."); + + PADDLE_ENFORCE( + ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), + "uniform_random's min must less then max"); + auto& shape = ctx->Attrs().Get>("shape"); + std::vector temp; + temp.reserve(shape.size()); + for (auto dim : shape) { + temp.push_back(static_cast(dim)); + } + ctx->SetOutputDim("Out", framework::make_ddim(temp)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) The output tensor of uniform random op"); + AddComment(R"DOC( +Uniform random operator. + +This operator initializes a tensor with random values sampled from a +uniform distribution. + +)DOC"); + AddAttr>("shape", + "(vector) The shape of the output tensor"); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system.") + .SetDefault(0); + AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + .SetDefault(framework::proto::DataType::FP32); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker); +REGISTER_OP_CPU_KERNEL(uniform_random, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9afca68e59f8c0eedaf38be4b51343b9cb043f65 --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
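The CPU uniform_random kernel above fills the output by drawing from std::uniform_real_distribution with a std::minstd_rand engine, falling back to std::random_device when the seed attribute is 0. A standalone sketch of the same sampling scheme; the attribute values and output size are made up for illustration.

#include <iostream>
#include <random>
#include <vector>

int main() {
  // Stand-ins for the op attributes.
  float min = -1.0f, max = 1.0f;
  unsigned int seed = 0;  // 0 means "use a system-generated seed"

  std::minstd_rand engine;
  if (seed == 0) seed = std::random_device()();
  engine.seed(seed);

  std::uniform_real_distribution<float> dist(min, max);

  std::vector<float> out(8);  // the output tensor, flattened
  for (auto& v : out) v = dist(engine);

  for (auto v : out) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}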
+See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + + __host__ __device__ UniformGenerator(T min, T max, int seed) + : min_(min), max_(max), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + return dist(rng); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class GPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e0b271fed69772a53a776f290d524565df3dc94 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" +namespace paddle { +namespace operators { + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, H_{out}, W_{out})$, where +$$ +H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +$$ +Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf +)DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + 
ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..15d81eb296ba35d9f67426083870c1a83ff66ee5 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ceed5507391b40491d4a26bbc51e8c861e1bf1c2 --- /dev/null +++ b/paddle/fluid/operators/unpool_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
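The output spatial size of the unpooling op follows the formula in the DOC block above: H_out = (H_in - 1) * stride - 2 * padding + ksize, and likewise for the width. A quick sketch of that computation, mirroring the OutputSize helper in unpool_op.cc, with illustrative numbers:

#include <iostream>

// Same arithmetic as the OutputSize helper in unpool_op.cc.
int UnpoolOutputSize(int input_size, int ksize, int padding, int stride) {
  return (input_size - 1) * stride - 2 * padding + ksize;
}

int main() {
  // E.g. a 7x7 feature map that was produced by 2x2 max-pooling with stride 2
  // unpools back to 14x14.
  int h_out = UnpoolOutputSize(/*input_size=*/7, /*ksize=*/2, /*padding=*/0,
                               /*stride=*/2);
  std::cout << "H_out = " << h_out << "\n";  // 14
  return 0;
}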
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/unpooling.h" + +namespace paddle { +namespace operators { +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (output_data) { + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); + } +}; +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + } + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c05fed0b47c3bb3582e4b261ef188146d41820e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
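The forward unpool kernel above zero-fills the output and then lets math::Unpool2dMaxFunctor scatter each input value to the position recorded in Indices; the backward pass gathers gradients from the same positions. A minimal single-channel sketch of that scatter/gather idea, using plain arrays instead of tensors; this illustrates the idea only and is not the functor's implementation.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // One 2x2 pooled feature map whose values came from a 4x4 plane; Indices
  // stores, for every pooled element, the flat position of the max inside
  // the corresponding 4x4 output plane (as produced by max pooling with
  // indices).
  std::vector<float> pooled = {5.f, 7.f, 6.f, 8.f};
  std::vector<int> indices = {0, 3, 10, 15};
  const int out_h = 4, out_w = 4;

  // Forward unpool: start from zeros, scatter each value to its index.
  std::vector<float> out(out_h * out_w, 0.f);
  for (size_t i = 0; i < pooled.size(); ++i) out[indices[i]] = pooled[i];

  // Backward: the gradient w.r.t. the pooled input is gathered back from the
  // same positions of the output gradient (here we simply reuse `out`).
  std::vector<float> pooled_grad(pooled.size());
  for (size_t i = 0; i < pooled.size(); ++i) pooled_grad[i] = out[indices[i]];

  for (int r = 0; r < out_h; ++r) {
    for (int c = 0; c < out_w; ++c) std::cout << out[r * out_w + c] << " ";
    std::cout << "\n";
  }
  return 0;
}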
*/ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace paddle { +namespace operators { + +class WarpCTCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"), + "Output(WarpCTCGrad) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of WarpCTCOp should not be null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + int sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + int blank = ctx->Attrs().Get("blank"); + PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), + "The value of Attr(blank) should be in interval [0, %d).", + sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(LodTensor, default: LoDTensor), the unscaled " + "probabilities of variable-length sequences, which is a 2-D " + "Tensor with LoD information. Its shape is " + "[Lp, num_classes + 1], where Lp is the sum of all input " + "sequences' lengths and num_classes is the true number of classes " + "(not including the blank label)."); + AddInput("Label", + "(LodTensor, default: LoDTensor), the ground truth " + "of variable-length sequences, which is a 2-D Tensor with LoD " + "information. It is of the shape [Lg, 1], where Lg is the sum of " + "all labels' lengths."); + AddOutput("WarpCTCGrad", + "(Tensor, default: Tensor), a temporary " + "output Tensor to store the gradients of warp-ctc, which is " + "computed with loss together in one call. It is a 3-D Tensor of " + "the shape [max_sequence_length, batch_size, num_classes + 1].") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), the Connectionist " + "Temporal Classification (CTC) loss, which is a 2-D Tensor of " + "the shape [batch_size, 1]."); + AddAttr("blank", + "(int, default: 0), the blank label of Connectionist " + "Temporal Classification (CTC) loss, which is in the " + "half-open interval [0, num_classes + 1).") + .SetDefault(0); + AddAttr("norm_by_times", + "(bool, default: false), whether to " + "normalize the gradients by the number of time-steps, " + "which is also the sequence's length.") + .SetDefault(false); + AddComment(R"DOC( +An operator integrating the open-source +[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in +[Deep Speech 2: End-to-End Speech Recognition in English and Mandarin]( +https://arxiv.org/pdf/1512.02595v1.pdf), +to compute Connectionist Temporal Classification (CTC) loss. +It can be aliased as softmax with CTC, since a native softmax activation is +integrated into the warp-ctc library to normalize the values in each row of the +input tensor. 
+ +More detail of CTC loss can be found by refering to +[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with +Recurrent Neural Networks]( +http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). +)DOC"); + } +}; + +class WarpCTCGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"), + "Input(WarpCTCGrad) of WarpCTCGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@GRAD) of WarpCTCGradOp should not be null."); + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, + ops::WarpCTCGradOp); +REGISTER_OP_CPU_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CPU_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ee7f970a9a5f7deaab8a98278cf1a7c051cfbd2 --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/warpctc_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a1de71627ee7e511a03231c11831cf54755a145e --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/fluid/operators/math/sequence_scale.h" +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class WarpCTCFunctor { + public: + /* + * \brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * If gradient is nullptr, it only computes the ctc loss, + * or computes both ctc loss and gradient. + * + * \param ctx execution context of this functor + * \param input batch matrix of input probabilities, in + * max_sequence_length x num_sequences x + * sequence_width, (row-major) format + * \param gradient batch matrix of gradient, with the same shape as + * input. + * \param cpu_labels labels always in CPU memory. + * \param cpu_label_lengths length of all labels in CPU memory. + * \param cpu_input_lengths length of all sequences in CPU memory. + * \param sequence_width number of possible output symbols. + * \param num_sequences number of sequence. + * \param blank blank label used in ctc loss function. + * \param cpu_losss cost of each sequence in CPU memory. + */ + void operator()(const framework::ExecutionContext& ctx, const float* input, + float* gradient, const int* cpu_labels, + const int* cpu_label_lengths, const int* cpu_input_lengths, + const size_t sequence_width, const size_t num_sequences, + const size_t blank, float* cpu_loss) { + // Init warp-ctc options + init(ctx, blank); + + // Compute the required workspace size. + // There is no memory allocated operations within warp-ctc. 
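+    // warp-ctc is driven in two steps: get_workspace_size() reports how much
+    // scratch memory the library needs for this batch, and compute_ctc_loss()
+    // then performs the forward/backward pass inside that caller-provided
+    // workspace. The byte count is rounded up to a whole number of floats so
+    // the scratch can live in a float Tensor, and it is zero-initialized
+    // before use.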
+ size_t workspace_bytes = 0; + ctcStatus_t status = platform::dynload::get_workspace_size( + cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), + static_cast(num_sequences), options_, &workspace_bytes); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in get_workspace_size: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + PADDLE_ENFORCE_GT(workspace_bytes, 0UL, + "Bytes of workspace got by warp-ctc function, " + "get_workspace_size(), should be larger than 0."); + + Tensor workspace; + size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; + float* workspace_data = workspace.mutable_data( + framework::make_ddim({static_cast(workspace_elements)}), + ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &workspace, + static_cast(0)); + + // compute loss and gradient + status = platform::dynload::compute_ctc_loss( + input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, + static_cast(sequence_width), static_cast(num_sequences), + cpu_loss, workspace_data, options_); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in compute_ctc_loss: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + } + + protected: + void init(const framework::ExecutionContext& ctx, const size_t blank) { + warpctc_version_ = platform::dynload::get_warpctc_version(); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + options_.loc = CTC_GPU; + options_.stream = reinterpret_cast( + ctx.device_context()) + .stream(); +#else + PADDLE_THROW("[warpctc init] GPU is not enabled."); +#endif + } else { + options_.loc = CTC_CPU; + options_.num_threads = 1; + } + + options_.blank_label = blank; + } + + private: + int warpctc_version_; + ctcOptions options_; +}; + +template +class WarpCTCKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* label = ctx.Input("Label"); + auto* warpctc_grad = ctx.Output("WarpCTCGrad"); + auto* loss = ctx.Output("Loss"); + + const size_t level = 0; + + auto logits_lod = framework::ToAbsOffset(logits->lod()); + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ(logits_dims[0], + static_cast(logits_lod[level].back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + auto label_lod = framework::ToAbsOffset(label->lod()); + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) should be 1."); + + const size_t num_sequences = logits_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + const size_t sequence_width = logits->numel() / logits_dims[0]; + auto loss_dims = + framework::make_ddim({static_cast(num_sequences), 1}); + + // warpctc needs sequences data stored in transposed padding format + Tensor warpctc_logits; + const size_t max_sequence_length = + math::MaximumSequenceLength(logits_lod, level); + auto warpctc_logits_dims = + framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, warpctc_logits, + false); + const T* 
warpctc_logits_data = warpctc_logits.data(); + + std::vector warpctc_label_lengths(num_sequences); + std::vector warpctc_logits_lengths(num_sequences); + + for (size_t i = 0; i < num_sequences; ++i) { + warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; + warpctc_logits_lengths[i] = + logits_lod[level][i + 1] - logits_lod[level][i]; + } + + // warpctc computes loss and gradient in one call, gradient data also stored + // in batch format + T* warpctc_grad_data = + warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), warpctc_grad, + static_cast(0)); + + // warpctc accesses labels in CPU memory + Tensor warpctc_label; + Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); + // warpctc stores loss in CPU memory + Tensor warpctc_loss; + T* warpctc_loss_data = + warpctc_loss.mutable_data(loss_dims, platform::CPUPlace()); + + const size_t blank = static_cast(ctx.Attr("blank")); + + WarpCTCFunctor()( + ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + sequence_width, num_sequences, blank, warpctc_loss_data); + + // Copy the loss back + Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + } +}; + +template +class WarpCTCGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); + + logits_grad->mutable_data(ctx.GetPlace()); + bool norm_by_times = ctx.Attr("norm_by_times"); + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + *warpctc_grad, norm_by_times); + + const T* loss_grad_data = loss_grad->data(); + math::ScaleLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + loss_grad_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d254c572acff52d967e551c377b3b32b05c92973 --- /dev/null +++ b/paddle/fluid/operators/while_op.cc @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
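The kernel above relies on math::PaddingLoDTensorFunctor, declared in math/sequence_padding.h (included earlier) and not part of this file, to repack the concatenated [Lp, sequence_width] logits into the time-major [max_sequence_length, num_sequences, sequence_width] buffer that warp-ctc consumes. A standalone sketch of that repacking, assuming padded positions are simply zero-filled:

#include <algorithm>
#include <cstddef>
#include <vector>

// Pad rows of a concatenated [Lp, width] matrix into a time-major
// [max_len, num_seq, width] buffer; rows of short sequences stay zero.
std::vector<float> PadTimeMajor(const std::vector<float>& rows,
                                const std::vector<size_t>& offsets,  // LoD
                                size_t width) {
  const size_t num_seq = offsets.size() - 1;
  size_t max_len = 0;
  for (size_t s = 0; s < num_seq; ++s) {
    max_len = std::max(max_len, offsets[s + 1] - offsets[s]);
  }
  std::vector<float> padded(max_len * num_seq * width, 0.0f);
  for (size_t s = 0; s < num_seq; ++s) {
    const size_t len = offsets[s + 1] - offsets[s];
    for (size_t t = 0; t < len; ++t) {
      for (size_t w = 0; w < width; ++w) {
        // destination: time step t, sequence s, class w
        padded[(t * num_seq + s) * width + w] =
            rows[(offsets[s] + t) * width + w];
      }
    }
  }
  return padded;
}

The gradient kernel goes the other way with UnpaddingLoDTensorFunctor and then scales each sequence's rows by its incoming Loss gradient via ScaleLoDTensorFunctor.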
*/ + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +using StepScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; + +class WhileOp : public framework::OperatorBase { + public: + WhileOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + auto &cond = scope.FindVar(Input(kCondition))->Get(); + PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); + + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + auto step_scopes = + scope.FindVar(Output(kStepScopes))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); + while (cond.data()[0]) { + auto ¤t_scope = scope.NewScope(); + step_scopes->push_back(¤t_scope); + + executor.Run(*program, ¤t_scope, block->ID(), + false /*create_local_scope*/); + } + } +}; + +class WhileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kX, + "A set of variables, which are required by operators inside the " + "block of While Op.") + .AsDuplicable(); + AddInput( + kCondition, + "(Bool) An scalar. When it's False, the While Op will be terminated.") + .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the block of While Op.") + .AsDuplicable(); + AddOutput(kStepScopes, + "(StepScopeVar) A vector of local scope, which size equals the " + "step number of While Op. 
The i'th scope storages temporary " + "variables generated in the i'th step."); + AddAttr(kStepBlock, + "The step block inside WhileOp"); + AddComment(R"DOC( +)DOC"); + } +}; + +class WhileGradOp : public framework::OperatorBase { + public: + WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); + + auto *step_scopes = + scope.FindVar(Input(kStepScopes))->GetMutable(); + + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + + for (auto cur_scope_iter = step_scopes->rbegin(); + cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); + if (og_outside.Type().hash_code() == + typeid(framework::LoDTensor).hash_code()) { + auto &outside_tensor = og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.Type().hash_code() == + typeid(framework::LoDTensorArray).hash_code()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(8) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } + } + + executor.Run(*program, *cur_scope_iter, block->ID(), false); + + auto &pg_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); + PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + if (pg_names[param_id] == framework::kEmptyVarName) { + continue; // parameter doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // // TODO(tonyyang-svail): Not sure we need the following + // // If does not compute gradient of that variable inside rnn, + // just + // // continue + // if (local_var_names.find(inside_grad_name) == + // local_var_names.end()) { + // continue; + // } + + // zero gradient variable in step 0 + if (cur_scope_iter == 
step_scopes->rbegin()) { + auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); + if (var->IsType()) { + auto &inside_tensor = var->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto var_name = pg_names[param_id]; + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {var_name}}}, attrs); + zero_op->Run(scope, dev_place); + scope.FindVar(var_name) + ->GetMutable() + ->set_lod(inside_tensor.lod()); + } + } + + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); + } + } +}; + +class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ParentBlock(); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); + } + } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); + for (auto &each_ig : igs) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { + VLOG(8) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + while_grad->SetOutput(framework::GradVarName(kX), igs); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. + std::unordered_set block_ins; + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } + std::unordered_set extra_inputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. + if (block_ins.find(input_name) != block_ins.end() || + fwd_block->FindVar(input_name) != nullptr) { + continue; + } + extra_inputs.insert(input_name); + } + for (auto &output_name : op->OutputArgumentNames()) { + block_ins.insert(output_name); + } + } + + std::vector extra_inputs_list; + extra_inputs_list.resize(extra_inputs.size()); + std::copy(extra_inputs.begin(), extra_inputs.end(), + extra_inputs_list.begin()); + while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, *grad_block); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. 
+ while_grad->SetAttr("original_output_grad", extra_inputs_list); + + return std::unique_ptr(while_grad); + } +}; + +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto p_names = op_desc.Input(kX); + auto pg_names = op_desc.Output(framework::GradVarName(kX)); + + for (size_t i = 0; i < p_names.size(); ++i) { + auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); + auto *g_var = block->FindVarRecursive(pg_names[i]); + if (g_var != nullptr) { // Gradient could be @EMPTY@ + VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); + g_var->SetType(p_var.GetType()); + g_var->SetDataType(p_var.GetDataType()); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto p_names = ctx->Inputs(kX); + auto pg_names = ctx->Outputs(kXGRAD); + auto var_types = ctx->GetInputsVarType(kX); + std::vector names_to_set; + std::vector dims_to_set; + for (size_t i = 0; i < p_names.size(); ++i) { + if (pg_names[i] == framework::kEmptyVarName) { + continue; + } + auto dims = ctx->GetInputsElementDim(kX, i); + if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) { + // not sure how to set the dim of LOD_TENSOR_ARRAY + names_to_set.push_back(pg_names[i]); + dims_to_set.push_back(dims); + } + } + ctx->SetDims(names_to_set, dims_to_set); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(while, paddle::operators::WhileOp, + paddle::operators::WhileOpMaker, + paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/fluid/platform/.clang-format b/paddle/fluid/platform/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/platform/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt similarity index 100% rename from paddle/platform/CMakeLists.txt rename to paddle/fluid/platform/CMakeLists.txt diff --git a/paddle/platform/assert.h b/paddle/fluid/platform/assert.h similarity index 100% rename from paddle/platform/assert.h rename to paddle/fluid/platform/assert.h diff --git a/paddle/platform/call_once.h b/paddle/fluid/platform/call_once.h similarity index 100% rename from paddle/platform/call_once.h rename to paddle/fluid/platform/call_once.h diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..47473aead0e512005a63e60b01170b41500dd1f6 --- /dev/null +++ b/paddle/fluid/platform/cpu_info.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
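In WhileGradOp::Run above, a parameter's gradient is accumulated across time steps by zero-initializing it when the first reversed step is visited, renaming the per-step gradient, running a sum operator, and renaming back. Stripped of the scope and operator machinery, the arithmetic is just a running element-wise sum over steps, roughly:

#include <cstddef>
#include <vector>

// Accumulate per-step parameter gradients, visiting steps in reverse order
// just as WhileGradOp walks step_scopes->rbegin() .. rend().
std::vector<float> AccumulateGrads(
    const std::vector<std::vector<float>>& step_grads) {
  std::vector<float> total;
  for (auto it = step_grads.rbegin(); it != step_grads.rend(); ++it) {
    if (total.empty()) {
      // "zero gradient variable in step 0": the first visited step
      // initializes the running sum with zeros of the right shape.
      total.assign(it->size(), 0.0f);
    }
    for (size_t i = 0; i < it->size(); ++i) {
      total[i] += (*it)[i];  // the real op runs a "sum" operator here
    }
  }
  return total;
}

Expressing the zero-fill and the addition as fill_constant and sum operators keeps the whole backward pass inside the program description rather than in ad-hoc C++.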
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h similarity index 100% rename from paddle/platform/cpu_info.h rename to paddle/fluid/platform/cpu_info.h diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1fdba13b80629902faf2bcebb646572b36b459e --- /dev/null +++ b/paddle/fluid/platform/cpu_info_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
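To make the sizing policy in cpu_info.cc concrete: with the default flag value of 1.0 the allocator may use all of physical memory, hands out chunks of at most 1/32 of that budget (roughly 3%), and never manages chunks smaller than 4 KiB. A worked example with an assumed 16 GiB machine:

#include <cstddef>
#include <cstdio>

int main() {
  const double fraction = 1.0;             // FLAGS_fraction_of_cpu_memory_to_use
  const size_t total_bytes = 16ULL << 30;  // assumed 16 GiB of physical RAM
  const size_t max_alloc = static_cast<size_t>(fraction * total_bytes);
  const size_t max_chunk = max_alloc / 32;  // 512 MiB on this machine
  const size_t min_chunk = 1 << 12;         // 4 KiB, as in CpuMinChunkSize()
  std::printf("max_alloc=%zu max_chunk=%zu min_chunk=%zu\n", max_alloc,
              max_chunk, min_chunk);
  return 0;
}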
+#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h similarity index 100% rename from paddle/platform/cuda_helper.h rename to paddle/fluid/platform/cuda_helper.h diff --git a/paddle/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h similarity index 100% rename from paddle/platform/cuda_profiler.h rename to paddle/fluid/platform/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..f2daa4f4fcc6fe43c2950b413024df7e301abf50 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper.h @@ -0,0 +1,286 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +inline const char* cudnnGetErrorString(cudnnStatus_t status) { + switch (status) { + case CUDNN_STATUS_SUCCESS: + return "CUDNN_STATUS_SUCCESS"; + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDNN_STATUS_NOT_INITIALIZED"; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDNN_STATUS_ALLOC_FAILED"; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_INTERNAL_ERROR: + return "CUDNN_STATUS_INTERNAL_ERROR"; + case CUDNN_STATUS_INVALID_VALUE: + return "CUDNN_STATUS_INVALID_VALUE"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH"; + case CUDNN_STATUS_MAPPING_ERROR: + return "CUDNN_STATUS_MAPPING_ERROR"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_LICENSE_ERROR: + return "CUDNN_STATUS_LICENSE_ERROR"; + default: + return "Unknown cudnn error number"; + } +} + +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ + PADDLE_THROW("cuDNN call failed"); \ + } \ + } while (false) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kAverage, +}; + +template +class CudnnDataType; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + typedef const float ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + typedef const double ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline cudnnTensorFormat_t GetCudnnTensorFormat( + const DataLayout& order) { // Not use + switch (order) { + case DataLayout::kNHWC: + return CUDNN_TENSOR_NHWC; + case DataLayout::kNCHW: + return CUDNN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same + default: + PADDLE_THROW("Unknown cudnn equivalent for order"); + } + return CUDNN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_)); + } + + inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); // copy + if 
(groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + desc_, type, dims_with_group.size(), dims_with_group.data(), + strides.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); + } + + private: + cudnnTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_)); + } + + inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, + const cudnnDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. + } + PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( + desc_, type, format, kernel_with_group.size(), + kernel_with_group.data())); + return desc_; + } + + template + inline cudnnFilterDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + kernel, groups); + } + + private: + cudnnFilterDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_)); + } + + inline cudnnConvolutionDescriptor_t descriptor( + cudnnDataType_t type, const std::vector& pads, + const std::vector& strides, const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), strides.size()); + PADDLE_ENFORCE_EQ(pads.size(), dilations.size()); + +#if !CUDNN_VERSION_MIN(6, 0, 0) + // cudnn v5 does not support dilation conv, the argument is called upscale + // instead of dilations and it is must be one. 
+ for (size_t i = 0; i < dilations.size(); ++i) { + PADDLE_ENFORCE_EQ( + dilations[i], 1, + "Dilations conv is not supported in this cuDNN version(%d.%d.%d).", + CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, + CUDNN_VERSION % 100); + } +#endif + + PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( + desc_, pads.size(), pads.data(), strides.data(), dilations.data(), + CUDNN_CROSS_CORRELATION, type)); + return desc_; + } + + template + inline cudnnConvolutionDescriptor_t descriptor( + const std::vector& pads, const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + cudnnConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_)); + } + + inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); + PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); + PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( + desc_, (mode == PoolingMode::kMaximum + ? CUDNN_POOLING_MAX + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING), + CUDNN_PROPAGATE_NAN, // Always propagate nans. + kernel.size(), kernel.data(), pads.data(), strides.data())); + return desc_; + } + + private: + cudnnPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd0bd3fe3ed115c4a91723e1023851456da74890 --- /dev/null +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
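ScopedTensorDescriptor above derives packed strides from the dimension vector before calling cudnnSetTensorNdDescriptor: the innermost stride is 1 and every outer stride is the product of the dimensions inside it. Running the same loop on a small NCHW shape reproduces the values the unit test below asserts ({2, 4, 6, 6} yields strides {144, 36, 6, 1}):

#include <vector>

// Packed (contiguous) strides for a dimension vector, innermost first.
std::vector<int> PackedStrides(const std::vector<int>& dims) {
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;  // {2, 4, 6, 6} -> {144, 36, 6, 1}
}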
*/ + +#include "paddle/fluid/platform/cudnn_helper.h" +#include + +TEST(CudnnHelper, ScopedTensorDescriptor) { + using paddle::platform::ScopedTensorDescriptor; + using paddle::platform::DataLayout; + + ScopedTensorDescriptor tensor_desc; + std::vector shape = {2, 4, 6, 6}; + auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + std::vector dims(4); + std::vector strides(4); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc, 4, &type, &nd, dims.data(), strides.data()); + + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], shape[i]); + } + EXPECT_EQ(strides[3], 1); + EXPECT_EQ(strides[2], 6); + EXPECT_EQ(strides[1], 36); + EXPECT_EQ(strides[0], 144); + + // test tensor5d: ScopedTensorDescriptor + ScopedTensorDescriptor tensor5d_desc; + std::vector shape_5d = {2, 4, 6, 6, 6}; + auto desc_5d = tensor5d_desc.descriptor(DataLayout::kNCDHW, shape_5d); + + std::vector dims_5d(5); + std::vector strides_5d(5); + paddle::platform::dynload::cudnnGetTensorNdDescriptor( + desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data()); + + EXPECT_EQ(nd, 5); + for (size_t i = 0; i < dims_5d.size(); ++i) { + EXPECT_EQ(dims_5d[i], shape_5d[i]); + } + EXPECT_EQ(strides_5d[4], 1); + EXPECT_EQ(strides_5d[3], 6); + EXPECT_EQ(strides_5d[2], 36); + EXPECT_EQ(strides_5d[1], 216); + EXPECT_EQ(strides_5d[0], 864); +} + +TEST(CudnnHelper, ScopedFilterDescriptor) { + using paddle::platform::ScopedFilterDescriptor; + using paddle::platform::DataLayout; + + ScopedFilterDescriptor filter_desc; + std::vector shape = {2, 3, 3}; + auto desc = filter_desc.descriptor(DataLayout::kNCHW, shape); + + cudnnDataType_t type; + int nd; + cudnnTensorFormat_t format; + std::vector kernel(3); + paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format, + &nd, kernel.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < shape.size(); ++i) { + EXPECT_EQ(kernel[i], shape[i]); + } + + ScopedFilterDescriptor filter_desc_4d; + std::vector shape_4d = {2, 3, 3, 3}; + auto desc_4d = filter_desc.descriptor(DataLayout::kNCDHW, shape_4d); + + std::vector kernel_4d(4); + paddle::platform::dynload::cudnnGetFilterNdDescriptor( + desc_4d, 4, &type, &format, &nd, kernel_4d.data()); + + EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format); + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < shape_4d.size(); ++i) { + EXPECT_EQ(kernel_4d[i], shape_4d[i]); + } +} + +TEST(CudnnHelper, ScopedConvolutionDescriptor) { + using paddle::platform::ScopedConvolutionDescriptor; + + ScopedConvolutionDescriptor conv_desc; + std::vector src_pads = {2, 2, 2}; + std::vector src_strides = {1, 1, 1}; + std::vector src_dilations = {1, 1, 1}; + auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); + + cudnnDataType_t type; + cudnnConvolutionMode_t mode; + int nd; + std::vector pads(3); + std::vector strides(3); + std::vector dilations(3); + paddle::platform::dynload::cudnnGetConvolutionNdDescriptor( + desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode, + &type); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + EXPECT_EQ(dilations[i], src_dilations[i]); + } + EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION); +} + +TEST(CudnnHelper, ScopedPoolingDescriptor) { + using paddle::platform::ScopedPoolingDescriptor; + using paddle::platform::PoolingMode; + + ScopedPoolingDescriptor 
pool_desc; + std::vector src_kernel = {2, 2, 5}; + std::vector src_pads = {1, 1, 2}; + std::vector src_strides = {2, 2, 3}; + auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads, + src_strides); + + cudnnPoolingMode_t mode; + cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN; + int nd; + std::vector kernel(3); + std::vector pads(3); + std::vector strides(3); + paddle::platform::dynload::cudnnGetPoolingNdDescriptor( + desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data()); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(kernel[i], src_kernel[i]); + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + } + EXPECT_EQ(mode, CUDNN_POOLING_MAX); +} diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h similarity index 100% rename from paddle/platform/details/device_ptr_cast.h rename to paddle/fluid/platform/details/device_ptr_cast.h diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4da846bb1c25abc3d31006657652abaa5a11add --- /dev/null +++ b/paddle/fluid/platform/device_context.cc @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace platform { + +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Get( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } +} + +CPUDeviceContext::CPUDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CPUDeviceContext::GetPlace() const { return place_; } + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + CUDAPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + if (dynload::HasCUDNN()) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } else { + cudnn_handle_ = 
nullptr; + } +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + if (cudnn_handle_ != nullptr) { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void CUDADeviceContext::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaGetLastError()); +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +cublasHandle_t CUDADeviceContext::cublas_handle() const { + return cublas_handle_; +} + +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } + +cudaStream_t CUDADeviceContext::stream() const { return stream_; } + +#endif + +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); +} + +template +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement(op_key)) { + return; + } + GetElementPool().emplace(op_key, std::move(value)); +} + +template +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool().find(op_key); + return it == GetElementPool().end() ? nullptr : it->second; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return memory_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h new file mode 100644 index 0000000000000000000000000000000000000000..10b581f41a1e9473a2f85d3e5d2e40ee1fdaa1af --- /dev/null +++ b/paddle/fluid/platform/device_context.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
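Putting the pieces of device_context.cc together, the intended usage (mirroring the test at the end of this patch) is to build the pool once with every place the process will touch and then ask it for per-place contexts. A CPU-only sketch:

#include <vector>

#include "paddle/fluid/platform/device_context.h"

int main() {
  using paddle::platform::CPUPlace;
  using paddle::platform::DeviceContextPool;

  // The pool must be created exactly once, before anything calls Instance().
  std::vector<paddle::platform::Place> places;
  places.emplace_back(CPUPlace());
  DeviceContextPool::Init(places);

  // Later callers fetch the shared per-place context; repeated Get() calls
  // for the same place return the same object.
  auto* cpu_ctx = DeviceContextPool::Instance().GetByPlace(CPUPlace());
  return cpu_ctx != nullptr ? 0 : 1;
}

Sharing one context per place is what lets kernels reuse the per-device stream and the cuBLAS/cuDNN handles created in the CUDADeviceContext constructor above.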
*/ + +#pragma once + +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/gpu_info.h" +#define EIGEN_USE_GPU +#endif + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + virtual void Wait() const {} +}; + +class CPUDeviceContext : public DeviceContext { + public: + CPUDeviceContext(); + explicit CPUDeviceContext(CPUPlace place); + + Eigen::DefaultDevice* eigen_device() const; + + Place GetPlace() const override; + + private: + CPUPlace place_; + std::unique_ptr eigen_device_; +}; + +template +struct DefaultDeviceContextType; + +template <> +struct DefaultDeviceContextType { + using TYPE = CPUDeviceContext; +}; + +#ifdef PADDLE_WITH_CUDA + +class EigenCudaStreamDevice; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(CUDAPlace place); + virtual ~CUDADeviceContext(); + + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Return place in the device context. */ + Place GetPlace() const override; + + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; + + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle() const; + + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle() const; + + /*! \brief Return cuda stream in the device context. */ + cudaStream_t stream() const; + + private: + CUDAPlace place_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; + + cudaStream_t stream_; + cudnnHandle_t cudnn_handle_; + cublasHandle_t cublas_handle_; +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + +#endif + +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { + public: + explicit MKLDNNDeviceContext(CPUPlace place); + + /* \brief Add new element: memory, primitive or primitive desc */ + template + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template + const std::unordered_map>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare next exectue */ + void ResetStream(); + + private: + std::unordered_map> + memory_pool_; + std::unordered_map> + primitive_pool_; + std::unordered_map> + primitive_desc_pool_; + std::vector pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; +}; +#endif + +/*! 
\brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector& places); + + static DeviceContextPool& Instance() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! \brief Create should only called by Init function */ + static DeviceContextPool& Init(const std::vector& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Get(const platform::Place& place); + + template + const typename DefaultDeviceContextType::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType::TYPE*>(Get(place)); + } + + size_t size() const { return device_contexts_.size(); } + + private: + static DeviceContextPool* pool; + constexpr static int LEFT_SHIFT = 8; + struct Hash { + std::hash hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() << LEFT_SHIFT; + if (platform::is_gpu_place(place)) { + pre_hash += boost::get(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4dae6e90a8d12fc2dccab6fd3c6881e58e80fed --- /dev/null +++ b/paddle/fluid/platform/device_context_test.cu @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +#include "glog/logging.h" + +TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + delete device_context; + } +} + +TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); + cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + ASSERT_NE(nullptr, device_context->stream()); + delete device_context; + } +} + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::CUDAPlace; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); + } +} + +int main(int argc, char** argv) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt similarity index 100% rename from paddle/platform/dynload/CMakeLists.txt rename to paddle/fluid/platform/dynload/CMakeLists.txt diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc new file mode 100644 index 0000000000000000000000000000000000000000..c599712554b1d6183c896eb7fc6ac5bbf71d67fc --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/cublas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h new file mode 100644 index 0000000000000000000000000000000000000000..05f69e506515ac092c8509fba26e5b7a0f0823f7 --- /dev/null +++ b/paddle/fluid/platform/dynload/cublas.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, \ + paddle::platform::dynload::GetCublasDsoHandle, \ + &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSaxpy_v2); \ + __macro(cublasDaxpy_v2); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasSgeam_v2); \ + __macro(cublasDgeam_v2); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) + +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b1c4c4f9609ebb61674c07f6cfd615d9674dfcd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. 
Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h new file mode 100644 index 0000000000000000000000000000000000000000..00dfbc83872ed04d2b4e840e3f6ac31e89a8a3cd --- /dev/null +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; +extern bool HasCUDNN(); + +#ifdef PADDLE_USE_DSO + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, \ + paddle::platform::dynload::GetCUDNNDsoHandle, \ + &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnGetErrorString); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc new file mode 
100644 index 0000000000000000000000000000000000000000..eac690b1458a4e53f6958b77d01d2c3c9f26f6eb --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/curand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h new file mode 100644 index 0000000000000000000000000000000000000000..ce3115b3ce0dbaa30af63bfc15908f81c16309d1 --- /dev/null +++ b/paddle/fluid/platform/dynload/curand.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, \ + paddle::platform::dynload::GetCurandDsoHandle, \ + &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandGenerateNormal); \ + __macro(curandDestroyGenerator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb00f93b7cde0a39beb4adabd910eef634c2581c --- /dev/null +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(cudnn_dir, "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(cuda_dir, "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + +DEFINE_string(nccl_dir, "", + "Specify path for loading nccl library, such as libcublas, " + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); + +namespace paddle { +namespace platform { +namespace dynload { + +static inline std::string join(const std::string& part1, + const std::string& part2) { + // directory separator + const char sep = '/'; + if (!part2.empty() && part2.front() == sep) { + return part2; + } + std::string ret; + ret.reserve(part1.size() + part2.size() + 1); + ret = part1; + if (!ret.empty() && ret.back() != sep) { + ret += sep; + } + ret += part2; + return ret; +} + +static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, + void** dso_handle, + int dynload_flags) { + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. 
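+// On macOS the fallback below retries dlopen with the library name prefixed
+// by the default CUDA install location, /usr/local/cuda/lib/.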
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; + } + } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle, + bool throw_on_error = true) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } +} + +void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif +} + +void GetCUDNNDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); +#else + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); +#endif +} + +void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif +} + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); +#endif +} + +void GetNCCLDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); +#endif +} + +} // namespace dynload 
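+// Usage sketch (illustrative): the GetXXXDsoHandle functions above are meant
+// to be consumed by the DECLARE_DYNAMIC_LOAD_*_WRAP macros, which load the
+// shared library once and then resolve the requested symbol, roughly:
+//
+//   std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle);
+//   void* p_func = dlsym(cublas_dso_handle, "cublasSgemm_v2");
+//   // p_func is then cast to the matching cuBLAS function pointer type and invoked.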
+} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h similarity index 100% rename from paddle/platform/dynload/dynamic_loader.h rename to paddle/fluid/platform/dynload/dynamic_loader.h diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc new file mode 100644 index 0000000000000000000000000000000000000000..1dc3e96f04a4f98e5cf0cb7848a99213ca082b79 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nccl_dso_flag; +void *nccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +void LoadNCCLDSO() { + platform::call_once(nccl_dso_flag, + [] { GetNCCLDsoHandle(&nccl_dso_handle); }); +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h new file mode 100644 index 0000000000000000000000000000000000000000..349a4d0ba325fe1f7c23873c82dce0d35d295340 --- /dev/null +++ b/paddle/fluid/platform/dynload/nccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/call_once.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag nccl_dso_flag; +extern void* nccl_dso_handle; + +#ifdef PADDLE_USE_DSO +extern void LoadNCCLDSO(); + +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + paddle::platform::dynload::LoadNCCLDSO(); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + ncclResult_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define NCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclGetErrorString); + +NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc new file mode 100644 index 0000000000000000000000000000000000000000..84de2cae94790ca44c8c0f87d75a45a1b7001a64 --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h new file mode 100644 index 0000000000000000000000000000000000000000..f1955818dede54b0d9c53cb33b95c6102b854d7b --- /dev/null +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.cc b/paddle/fluid/platform/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..55cd80943cf18545d3cac6f3ebddee1030d62b37 --- /dev/null +++ b/paddle/fluid/platform/enforce.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform {} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..b22893c0a56f39347894924b3cd6ea64180aa8b6 --- /dev/null +++ b/paddle/fluid/platform/enforce.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include // for dladdr +#include // for backtrace +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/macros.h" +#include "paddle/string/printf.h" +#include "paddle/string/to_string.h" + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif + +#include + +#ifdef PADDLE_WITH_CUDA + +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/fluid/platform/dynload/curand.h" +#include "paddle/fluid/platform/dynload/nccl.h" + +#include +#include +#include +#include +#include + +#endif + +namespace paddle { +namespace platform { + +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif + +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + static constexpr int TRACE_STACK_LIMIT = 100; + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; + + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], + demangled, addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); + } + } + free(symbols); + err_str_ = sout.str(); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) + +template +inline typename std::enable_if::type throw_on_error( + bool stat, const Args&... args) { + if (UNLIKELY(!(stat))) { + throw std::runtime_error(string::Sprintf(args...)); + } +} + +#ifdef PADDLE_WITH_CUDA + +template +inline typename std::enable_if::type throw_on_error( + cudaError_t e, const Args&... args) { + if (UNLIKELY(e)) { + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + curandStatus_t stat, const Args&... args) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cudnnStatus_t stat, const Args&... 
args) { + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cublasStatus_t stat, const Args&... args) { + std::string err; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS: not initialized, "; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS: alloc failed, "; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS: invalid value, "; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS: arch mismatch, "; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS: mapping error, "; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS: execution failed, "; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS: internal error, "; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS: not supported, "; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS: license error, "; + } + throw std::runtime_error(err + string::Sprintf(args...)); +} + +template +inline typename std::enable_if::type throw_on_error( + ncclResult_t stat, const Args&... args) { + if (stat == ncclSuccess) { + return; + } else { + throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +#endif // PADDLE_ONLY_CPU + +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ + } while (false) + +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } while (false) + +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an expression described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack information. + * + * extra messages is also supported, for example: + * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) 
\ + do { \ + if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..896a9a04eca80f22bebf87859d277458d9bdb092 --- /dev/null +++ b/paddle/fluid/platform/enforce_test.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/string/piece.h" + +using StringPiece = paddle::string::Piece; +using paddle::string::HasPrefix; + +TEST(ENFORCE, OK) { + PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); + size_t val = 1; + const size_t limit = 10; + PADDLE_ENFORCE(val < limit, "Enforce is OK too"); +} + +TEST(ENFORCE, FAILED) { + bool caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. 
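+  // (the extra message is only rendered into the error text when the check fails)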
+ PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool caught_exception = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1.0 != 1UL failed, 1 == 1")) + << error.what() << " does not have expected prefix"; + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1UL < 0.12 failed, 1 >= 0.12")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NOT_NULL, OK) { + int* a = new int; + PADDLE_ENFORCE_NOT_NULL(a); + delete a; +} +TEST(ENFORCE_NOT_NULL, FAIL) { + bool caught_exception = false; + try { + int* a = nullptr; + PADDLE_ENFORCE_NOT_NULL(a); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); + } + EXPECT_TRUE(caught_exception); +} + +struct Dims { + size_t dims_[4]; + + bool operator==(const Dims& o) const { + for (size_t i = 0; i < 4; ++i) { + if (dims_[i] != o.dims_[i]) return false; + } 
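+    // every extent matched, so the two Dims compare equal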
+ return true; + } +}; + +std::ostream& operator<<(std::ostream& os, const Dims& d) { + for (size_t i = 0; i < 4; ++i) { + if (i == 0) { + os << "["; + } + os << d.dims_[i]; + if (i == 4 - 1) { + os << "]"; + } else { + os << ", "; + } + } + return os; +} + +TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { + Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; + PADDLE_ENFORCE_EQ(a, b); +} + +TEST(ENFORCE_USER_DEFINED_CLASS, NE) { + Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; + ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..0e695328c394cd5cd1a14f42b7c82e8899e2167b --- /dev/null +++ b/paddle/fluid/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr int num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..1797f59a9c9731d28febe32d268d2b07073550eb --- /dev/null +++ b/paddle/fluid/platform/gpu_info.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/gpu_info.h" + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/enforce.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetCUDADeviceCount() { + int count; + PADDLE_ENFORCE( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t &available, size_t &total) { + PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. + return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; + size_t reserving = static_cast(0.05 * total); + // If available less than minimum chunk size, no usable memory exists. + available = + std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + total - reserving); + + // Reserving the rest memory for page tables, etc. 
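+  // i.e. allocate only FLAGS_fraction_of_gpu_memory_to_use of what remains
+  // after the 5% reservation; the PADDLE_ENFORCE_LE below checks that this
+  // still fits into the currently available memory.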
+ + size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * + (total - reserving)); + + PADDLE_ENFORCE_LE(allocating, available); + + return allocating; +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); +} + +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream) { + PADDLE_ENFORCE( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h similarity index 100% rename from paddle/platform/gpu_info.h rename to paddle/fluid/platform/gpu_info.h diff --git a/paddle/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h similarity index 100% rename from paddle/platform/hostdevice.h rename to paddle/fluid/platform/hostdevice.h diff --git a/paddle/platform/macros.h b/paddle/fluid/platform/macros.h similarity index 100% rename from paddle/platform/macros.h rename to paddle/fluid/platform/macros.h diff --git a/paddle/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h similarity index 100% rename from paddle/platform/mkldnn_helper.h rename to paddle/fluid/platform/mkldnn_helper.h diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..75b95aff1a41dac70f1b732938c648ca55b2a973 --- /dev/null +++ b/paddle/fluid/platform/nccl_test.cu @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/init.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +static int dev_count = 0; + +namespace paddle { +namespace platform { + +TEST(NCCL, init) { + std::vector comms; + comms.resize(dev_count); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} + +template +struct PerThreadData { + thrust::device_vector send_buff; + thrust::device_vector recv_buff; + CUDADeviceContext dev_ctx; + + T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } + + T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } + + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { + send_buff.resize(size); + for (size_t i = 0; i < size; ++i) { + send_buff[i] = static_cast(i); + } + recv_buff.resize(size); + } +}; + +static constexpr int ELEM_COUNT = 10000; + +TEST(NCCL, all_reduce) { + std::vector comms; + comms.resize(dev_count); + VLOG(1) << "Initializing ncclComm"; + dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + VLOG(1) << "ncclComm initialized"; + VLOG(1) << "Creating thread data"; + std::vector>> data; + data.reserve(dev_count); + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Creating thread data for device " << i; + SetDeviceId(i); + data.emplace_back(new PerThreadData(i, ELEM_COUNT)); + } + VLOG(1) << "Thread data created"; + + VLOG(1) << "Check send_buf data"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Check on device " << i; + SetDeviceId(i); + thrust::host_vector tmp = data[i]->send_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); + } + } + + VLOG(1) << "Invoking ncclAllReduce"; + + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Invoking ncclAllReduce with device " << i; + SetDeviceId(i); + PADDLE_ENFORCE(dynload::ncclAllReduce( + data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, + ncclSum, comms[i], data[i]->dev_ctx.stream())); + VLOG(1) << "Invoked ncclAllReduce for device " << i; + } + + VLOG(1) << "Invoked ncclAllReduce"; + + VLOG(1) << "Sync devices"; + for (int i = 0; i < dev_count; ++i) { + VLOG(1) << "Sync device " << i; + SetDeviceId(i); + data[i]->dev_ctx.Wait(); + } + VLOG(1) << "device synced"; + + for (int i = 0; i < dev_count; ++i) { + SetDeviceId(i); + VLOG(1) << "Checking vector on device " << i; + thrust::host_vector tmp = data[i]->recv_buff; + for (size_t j = 0; j < tmp.size(); ++j) { + auto elem = static_cast(j); + elem *= dev_count; + ASSERT_NEAR(tmp[j], elem, 1e-4); + } + } + + for (int i = 0; i < dev_count; ++i) { + dynload::ncclCommDestroy(comms[i]); + } +} +} // namespace platform +} // namespace paddle + +int main(int argc, char** argv) { + // FIXME(tonyyang-svail): + // Due to the driver issue on our CI, disable for now + return 0; + dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::CUDAPlace(i)); + } + + VLOG(0) << " 
DeviceCount " << count; + paddle::platform::DeviceContextPool::Init(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc new file mode 100644 index 0000000000000000000000000000000000000000..e99b75d761abcf065070f463d578171797383cea --- /dev/null +++ b/paddle/fluid/platform/place.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +namespace detail { + +class PlacePrinter : public boost::static_visitor<> { + public: + explicit PlacePrinter(std::ostream &os) : os_(os) {} + void operator()(const CPUPlace &) { os_ << "CPUPlace"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } + + private: + std::ostream &os_; +}; + +} // namespace detail + +static Place the_default_place; + +void set_place(const Place &place) { the_default_place = place; } +const Place &get_place() { return the_default_place; } + +const CUDAPlace default_gpu() { return CUDAPlace(0); } +const CPUPlace default_cpu() { return CPUPlace(); } + +bool is_gpu_place(const Place &p) { + return boost::apply_visitor(IsCUDAPlace(), p); +} + +bool is_cpu_place(const Place &p) { return !is_gpu_place(p); } + +bool places_are_same_class(const Place &p1, const Place &p2) { + return p1.which() == p2.which(); +} + +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get(p1) == boost::get(p2); + } + } else { + return false; + } +} + +std::ostream &operator<<(std::ostream &os, const Place &p) { + detail::PlacePrinter printer(os); + boost::apply_visitor(printer, p); + return os; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h new file mode 100644 index 0000000000000000000000000000000000000000..2977a41036e84fc5aabd69c24cc9e62391b7dc38 --- /dev/null +++ b/paddle/fluid/platform/place.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace platform { + +struct CPUPlace { + // WORKAROUND: for some reason, omitting this constructor + // causes errors with boost 1.59 and OSX + CPUPlace() {} + + // needed for variant equality comparison + inline bool operator==(const CPUPlace &) const { return true; } + inline bool operator!=(const CPUPlace &) const { return false; } +}; + +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + + int device; +}; + +struct IsCUDAPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const CUDAPlace &gpu) const { return true; } +}; + +typedef boost::variant Place; + +using PlaceList = std::vector; + +void set_place(const Place &); +const Place &get_place(); + +const CUDAPlace default_gpu(); +const CPUPlace default_cpu(); + +bool is_gpu_place(const Place &); +bool is_cpu_place(const Place &); +bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); + +std::ostream &operator<<(std::ostream &, const Place &); + +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f248902d91c1ffad1364a2f1078a41626b61ac22 --- /dev/null +++ b/paddle/fluid/platform/place_test.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/platform/place.h" +#include +#include "gtest/gtest.h" + +TEST(Place, Equality) { + paddle::platform::CPUPlace cpu; + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); + + EXPECT_EQ(cpu, cpu); + EXPECT_EQ(g0, g0); + EXPECT_EQ(g1, g1); + EXPECT_EQ(g0, gg0); + + EXPECT_NE(g0, g1); + + EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); + EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); +} + +TEST(Place, Default) { + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place())); + EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); + + EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); + paddle::platform::set_place(paddle::platform::CPUPlace()); + EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); +} + +TEST(Place, Print) { + { + std::stringstream ss; + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); + } + { + std::stringstream ss; + ss << paddle::platform::CPUPlace(); + EXPECT_EQ("CPUPlace", ss.str()); + } +} diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..28d2675f799a2d398d43dc31c550f0d84424116e --- /dev/null +++ b/paddle/fluid/platform/profiler.cc @@ -0,0 +1,346 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler.h" +#include +#include +#include "glog/logging.h" + +namespace paddle { +namespace platform { + +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. +static std::string g_profiler_place = ""; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. +static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + const DeviceContext* dev_ctx) + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
+    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedMs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
+}
+
+double Event::CudaElapsedMs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+}
+
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  name_ = name;
+  PushEvent(name_, dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  PopEvent(name_, dev_ctx_);
+}
+
+void EnableProfiler(ProfilerState state) {
+  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
+                 "Can't enable profiling, since the input state is ",
+                 "ProfilerState::kDisabled");
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
+                 "The profiling state should be disabled when calling ",
+                 "EnableProfiler.");
+  g_state = state;
+  g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
+#ifdef PADDLE_WITH_CUDA
+  if (g_state == ProfilerState::kCUDA) {
+    // Generate some dummy events first to reduce the startup overhead.
+    for (int i = 0; i < 5; i++) {
+      ForEachDevice([](int d) {
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
+        Mark("_cuda_startup_", dev_ctx);
+        dev_ctx->Wait();
+        delete dev_ctx;
+      });
+    }
+  }
+#endif
+  // Mark the profiling start.
+ Mark("_start_profiler_", nullptr); +} + +void ResetProfiler() { + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + (*it)->Clear(); + } +} + +std::vector> GetAllEvents() { + std::lock_guard guard(g_all_event_lists_mutex); + std::vector> result; + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); + } + return result; +} + +void DisableProfiler(EventSortingKey sorted_key) { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); +} + +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event first end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? 
rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." << std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0bc5e666cb4f28b99169435cb3dd52829c35a2c2 --- /dev/null +++ b/paddle/fluid/platform/profiler.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include
+#include
+#include
+#include
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+enum EventKind { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the cuda stream.
+  // In CPU profiling mode, nullptr can be passed.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const DeviceContext* dev_ctx);
+
+  std::string kind() const;
+  std::string name() const { return name_; }
+  uint32_t thread_id() const { return thread_id_; }
+  bool has_cuda() const { return has_cuda_; }
+
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+
+  double CpuElapsedMs(const Event& e) const;
+  double CudaElapsedMs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+  bool has_cuda_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
+enum ProfilerState {
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
+};
+
+void Mark(const std::string& name, const DeviceContext* dev_ctx);
+
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+struct RecordEvent {
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current cuda stream.
+  const DeviceContext* dev_ctx_;
+  // Event name
+  std::string name_;
+};
+
+// Return the event lists of all threads. Assuming the returned value is
+// called event_lists, event_lists[i][j] represents the j-th Event of the
+// i-th thread.
+std::vector<std::vector<Event>> GetAllEvents();
+
+// The information of each event given in the profiling report
+struct EventItem {
+  std::string name;
+  int calls;
+  double total_time;
+  double min_time;
+  double max_time;
+  double ave_time;
+};
+
+// Candidate keys to sort the profiling report
+enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state); + +// Clear the g_all_event_lists, which is total event lists of all threads. +void ResetProfiler(); + +void DisableProfiler(EventSortingKey sorted_key); + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2525c38b6fb98ea9bac49f7eb28e755bd7fa9a2 --- /dev/null +++ b/paddle/fluid/platform/profiler_test.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler.h" +#include "gtest/gtest.h" + +TEST(Event, CpuElapsedTime) { + using paddle::platform::Event; + using paddle::platform::EventKind; + + Event start_event(EventKind::kPushRange, "test", 0, nullptr); + EXPECT_TRUE(start_event.has_cuda() == false); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, nullptr); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Event, CudaElapsedTime) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + using paddle::platform::Event; + using paddle::platform::EventKind; + + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); + Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); + EXPECT_TRUE(start_event.has_cuda() == true); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); +} +#endif + +TEST(RecordEvent, RecordEvent) { + using paddle::platform::DeviceContext; + using paddle::platform::Event; + using paddle::platform::EventKind; + using paddle::platform::RecordEvent; + using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; + + ProfilerState state = ProfilerState::kCPU; + DeviceContext* dev_ctx = nullptr; +#ifdef PADDLE_WITH_CUDA + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + state = ProfilerState::kCUDA; + dev_ctx = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); +#endif + EnableProfiler(state); + + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... 
+ * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ... + * } + */ + for (int i = 1; i < 5; ++i) { + std::string name = "evs_op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); + std::vector> events = paddle::platform::GetAllEvents(); + + int cuda_startup_count = 0; + int start_profiler_count = 0; + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; + if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; + if (events[i][j].name() == "push") { + EXPECT_EQ(events[i][j + 1].name(), "pop"); +#ifdef PADDLE_WITH_CUDA + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); +#else + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); +#endif + } + } + } + EXPECT_EQ(cuda_startup_count % 5, 0); + EXPECT_EQ(start_profiler_count, 1); + + // Will remove parsing-related code from test later + DisableProfiler(EventSortingKey::kTotal); +} diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h new file mode 100644 index 0000000000000000000000000000000000000000..879daed19102c85cc5ea03933f8324023cec0fe2 --- /dev/null +++ b/paddle/fluid/platform/transform.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#ifdef __NVCC__ +#include +#include +#include "paddle/fluid/platform/details/device_ptr_cast.h" +#endif + +namespace paddle { +namespace platform { + +// Transform on host or device. It provides the same API in std library. 
+template +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op); + + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op); +}; + +template <> +struct Transform { + template + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + std::transform(first, last, result, op); + } + + template + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + std::transform(first1, last1, first2, result, op); + } +}; + +#ifdef __NVCC__ +template <> +struct Transform { + template + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first), details::DevPtrCast(last), + details::DevPtrCast(result), op); + } + + template + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + thrust::transform(thrust::cuda::par.on(context.stream()), + details::DevPtrCast(first1), details::DevPtrCast(last1), + details::DevPtrCast(first2), details::DevPtrCast(result), + op); + } +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e4b9edc2fd45e9c00f5339948172f6267210363 --- /dev/null +++ b/paddle/fluid/platform/transform_test.cu @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/transform.h" + +template +class Scale { + public: + explicit Scale(const T& scale) : scale_(scale) {} + + HOSTDEVICE T operator()(const T& a) const { return a * scale_; } + + private: + T scale_; +}; + +template +class Multiply { + public: + HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; + +TEST(Transform, CPUUnary) { + using namespace paddle::platform; + CPUDeviceContext ctx; + float buf[4] = {0.1, 0.2, 0.3, 0.4}; + Transform trans; + trans(ctx, buf, buf + 4, buf, Scale(10)); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, GPUUnary) { + using namespace paddle::platform; + using namespace paddle::memory; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; + float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); + Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); + ctx.Wait(); + Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); + } +} + +TEST(Transform, CPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + Transform trans; + CPUDeviceContext ctx; + trans(ctx, buf, buf + 4, buf, buf, Multiply()); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} + +TEST(Transform, GPUBinary) { + using namespace paddle::platform; + using namespace paddle::memory; + int buf[4] = {1, 2, 3, 4}; + CUDAPlace gpu0(0); + CUDADeviceContext ctx(gpu0); + int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); + Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); + ctx.Wait(); + Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); + Free(gpu0, gpu_buf); + for (int i = 0; i < 4; ++i) { + ASSERT_EQ((i + 1) * (i + 1), buf[i]); + } +} diff --git a/paddle/platform/variant.h b/paddle/fluid/platform/variant.h similarity index 100% rename from paddle/platform/variant.h rename to paddle/fluid/platform/variant.h diff --git a/paddle/fluid/pybind/.clang-format b/paddle/fluid/pybind/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/fluid/pybind/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
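
The Place variant, IsCUDAPlace, and PlaceVisitorWrapper introduced in paddle/fluid/platform/place.h above are intended to be consumed through VisitPlace together with a functor derived from boost::static_visitor. A minimal sketch of such a caller is given below; the visitor name and the printed strings are illustrative only and are not part of this patch.

#include <iostream>
#include <string>

#include "paddle/fluid/platform/place.h"

namespace {

// Hypothetical visitor: produces a human-readable tag for the visited place.
// Any functor deriving from boost::static_visitor<Result> works with
// paddle::platform::VisitPlace.
struct PlaceTagVisitor : public boost::static_visitor<std::string> {
  std::string operator()(const paddle::platform::CPUPlace &) const {
    return "running on host";
  }
  std::string operator()(const paddle::platform::CUDAPlace &gpu) const {
    return "running on GPU " + std::to_string(gpu.device);
  }
};

}  // namespace

int main() {
  paddle::platform::Place place = paddle::platform::CUDAPlace(0);
  // VisitPlace wraps the visitor in PlaceVisitorWrapper, which throws if a
  // CUDAPlace is visited in a build without PADDLE_WITH_CUDA.
  std::cout << paddle::platform::VisitPlace(place, PlaceTagVisitor()) << "\n";
  return 0;
}
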
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62f34030894e2fa21925bbc44e24b4e7d738d15 --- /dev/null +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -0,0 +1,9 @@ +if(WITH_PYTHON) + cc_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + if(NOT APPLE AND NOT ANDROID) + target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) +endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc new file mode 100644 index 0000000000000000000000000000000000000000..098252a83d3b7e2926bf737ce7f2b3794046f28f --- /dev/null +++ b/paddle/fluid/pybind/const_value.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "const_value.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace pybind { + +void BindConstValue(pybind11::module& m) { + m.def("kEmptyVarName", [] { return framework::kEmptyVarName; }); + m.def("kTempVarName", [] { return framework::kTempVarName; }); + m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); + m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/const_value.h b/paddle/fluid/pybind/const_value.h new file mode 100644 index 0000000000000000000000000000000000000000..67d14ac9ff01d1754dd8dd165b638db12c9d0ea0 --- /dev/null +++ b/paddle/fluid/pybind/const_value.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +extern void BindConstValue(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc new file mode 100644 index 0000000000000000000000000000000000000000..7398a88541bcbf338ca9568595d0dc7b16eff118 --- /dev/null +++ b/paddle/fluid/pybind/exception.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/exception.h" + +namespace paddle { +namespace pybind { + +void BindException(pybind11::module& m) { + static pybind11::exception exc(m, "EnforceNotMet"); + pybind11::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EnforceNotMet& e) { + exc(e.what()); + } + }); + + m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h new file mode 100644 index 0000000000000000000000000000000000000000..43e91a706300e561a20d88abe80d9b6654bd2171 --- /dev/null +++ b/paddle/fluid/pybind/exception.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" +namespace paddle { +namespace pybind { + +extern void BindException(pybind11::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc new file mode 100644 index 0000000000000000000000000000000000000000..f6c100eed7a2ea2d45f44271915febb3d510165e --- /dev/null +++ b/paddle/fluid/pybind/protobuf.cc @@ -0,0 +1,305 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/protobuf.h" +#include +#include +#include "paddle/fluid/framework/backward.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +// Cast boost::variant for PyBind. 
+// Copy from
+// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
+namespace pybind11 {
+namespace detail {
+
+// Can be replaced by a generic lambda in C++14
+struct variant_caster_visitor : public boost::static_visitor<handle> {
+  return_value_policy policy;
+  handle parent;
+
+  variant_caster_visitor(return_value_policy policy, handle parent)
+      : policy(policy), parent(parent) {}
+
+  template <class T>
+  handle operator()(T const &src) const {
+    return make_caster<T>::cast(src, policy, parent);
+  }
+};
+
+template <class Variant>
+struct variant_caster;
+
+template