diff --git a/doc/design/block.md b/doc/design/block.md index 7cbf0d55b1faeb2093ee7cf234d1c2ad1905885b..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -189,7 +189,7 @@ OpDesc { inputs = {0} // the index of x in vars of BlockDesc above outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above attrs { - "memories" : {1} // the index of h + "states" : {1} // the index of h "step_net" : } }; diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index 2c458a78c598bf206b30c0c07599ce605af77701..e767856d5012fd205f6b57f9721d0cbca8dc46ed 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -28,23 +28,37 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} add_dependencies(paddle_capi paddle_proto) -# combine all paddle static libraries together, into libpaddle_capi_whole.a -# user should use PaddleCAPI as -lpaddle_capi_whole -set(PADDLE_CAPI_INFER_LIBS - paddle_utils - paddle_parameter - paddle_math - paddle_cuda - paddle_function - paddle_gserver - paddle_proto) - +# TODO: paddle_capi_whole will be removed. +if(MOBILE_INFERENCE) + set(PADDLE_CAPI_INFER_LIBS + paddle_utils + paddle_parameter + paddle_math + paddle_cuda + paddle_function + paddle_gserver + paddle_proto) +else() + set(PADDLE_CAPI_INFER_LIBS + paddle_utils + paddle_parameter + paddle_math + paddle_cuda + paddle_function + paddle_gserver + paddle_proto + paddle_pserver + paddle_network) +endif() cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS}) -# No shared library for iOS +# Link the static library for inference +cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto) +cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver) + +# Link the shared library for inference if(NOT IOS) - set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map") - # TODO: merge mkl into paddle_capi_shared + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map") add_library(paddle_capi_shared SHARED ${CAPI_SOURCES}) set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) @@ -53,9 +67,10 @@ endif() # install library & headers. install(FILES ${CAPI_HEADERS} DESTINATION include/paddle) +install(FILES paddle_capi.map DESTINATION include/paddle) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle) if(ANDROID) - install(TARGETS paddle_capi_whole paddle_capi_shared + install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared ARCHIVE DESTINATION lib/${ANDROID_ABI} LIBRARY DESTINATION lib/${ANDROID_ABI}) execute_process( @@ -80,7 +95,7 @@ if(ANDROID) )" ) else(ANDROID) - install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib) + install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib) if(NOT IOS) install(TARGETS paddle_capi_shared DESTINATION lib) endif() diff --git a/paddle/capi/export.sym b/paddle/capi/export.sym deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/paddle/capi/export.map b/paddle/capi/paddle_capi.map similarity index 100% rename from paddle/capi/export.map rename to paddle/capi/paddle_capi.map diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index fb552fe3448b3f17e97e1262b5c9a0842f68f8b9..1ae7fb60f01e4925ceb310f661171eb231eb6c96 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -21,6 +21,7 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" @@ -220,8 +221,7 @@ static std::unique_ptr BackwardRecursive( // process recurrent gradient op as a special operator. if (forwardOp.Type() == "recurrent") { // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), - // or - // this will result in infinite loop. + // or this will result in infinite loop. const auto& rnnop = *static_cast(&forwardOp); auto rnn_grad_op = @@ -231,6 +231,18 @@ static std::unique_ptr BackwardRecursive( // create stepnet's gradient op rnn_grad_op->set_stepnet( BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); + } else if (forwardOp.Type() == "dynamic_recurrent") { + // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), + // or this will result in infinite loop. + const auto& rnnop = + *static_cast(&forwardOp); + auto rnn_grad_op = + static_cast(grad_op.get()); + const auto& stepnet_op = + *static_cast(&rnnop.rnn.GetStepUnit()); + // create stepnet's gradient op + rnn_grad_op->rnn.SetStepUnit( + BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); } if (net->ops_.empty()) { // Current no aux op is added to network diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index 649899d42572c9a22adca5337dcd56b0bcf42e7c..c25a62c2b11ead614d93a4be8d63d40d0cc0165a 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) { return DataType::FP64; } else if (typeid(int).hash_code() == type.hash_code()) { return DataType::INT32; + } else if (typeid(int64_t).hash_code() == type.hash_code()) { + return DataType::INT64; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index d50f0da03245783f8f0de481d7be0699fd10feac..1f1e4edda823d62b169422672c855d96a2bd2ede 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -84,8 +84,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { op->Run(local_scope, *device); } - // TODO(tonyyang-svail): - // - Destroy local_scope + scope->DeleteScope(&local_scope); } } // namespace framework diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h index 9b23ad271cb3782794f624cb17eaf28fd3ca801a..7feacb1e24708411e7fbb610f9909447cba9e291 100644 --- a/paddle/framework/feed_fetch_method.h +++ b/paddle/framework/feed_fetch_method.h @@ -21,28 +21,28 @@ limitations under the License. */ namespace paddle { namespace framework { -template -void SetFeedVariable(const LoDTensor& input, const std::string& var_name, - size_t index) { +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { // If var_name Variable is not found in GlobalScope, a new variable will // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; - Variable* g_feed_value = GetGlobalScope().Var(var_name); + Variable* g_feed_value = scope->Var(var_name); auto& feed_inputs = *(g_feed_value->GetMutable>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + feed_inputs[index].ShareDataWith(input); // set lod feed_inputs[index].set_lod(input.lod()); } -LoDTensor& GetFetchVariable(const std::string& var_name, size_t index) { +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { // Since we want to fetch LodTensor from a variable, the variable must // be created alreadly. - Variable* g_fetch_value = GetGlobalScope().FindVar(var_name); + Variable* g_fetch_value = scope.FindVar(var_name); PADDLE_ENFORCE(g_fetch_value->IsType(), "Only %s can be invoked by GetFetchVariable", typeid(FeedFetchList).name()); diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 5b7badf89c1714331bae9fc8cf94c8da2c66dbad..7c0ea0df7829883ccb36772634263cd33ff32e1d 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { for (size_t i = level_begin; i < level_end; i++) { new_lod.emplace_back(in.at(i)); } + // transform the lowest level to absolute offset. + LoD abs_offset_lod = ToAbsOffset(in); + new_lod.back() = abs_offset_lod[level_end - 1]; return new_lod; } LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end) { - // slice the lod. - LoD new_lod; - new_lod.reserve(in.size() - level); - auto start = in.at(level)[elem_begin]; - auto end = in.at(level)[elem_end]; - - for (auto it = in.begin() + level; it != in.end(); it++) { - auto it_begin = std::find(it->begin(), it->end(), start); - auto it_end = std::find(it_begin, it->end(), end); - PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info"); - PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info"); - new_lod.emplace_back(it_begin, it_end + 1); - // reset offset if tensor is copyed and sliced. - std::transform(new_lod.back().begin(), new_lod.back().end(), - new_lod.back().begin(), - [start](int v) { return v - start; }); - PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD"); + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto& in_level = in[level + lvl]; + const auto& above_level = res[lvl - 1]; + auto& out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); } - PADDLE_ENFORCE_LE(new_lod.size(), in.size()); - return new_lod; + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto& ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD& in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (int level = result.size() - 2; level >= 0; level--) { + for (auto& ele : result[level]) { + ele = result[level + 1][ele]; + } + } + return result; } bool operator==(const LoD& a, const LoD& b) { @@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) { size_t LoDTensor::NumElements(size_t level, size_t idx) const { PADDLE_ENFORCE_LT(level, NumLevels()); PADDLE_ENFORCE_LT(idx, NumElements(level)); - // the last level of LoD, just return number of records in Tensor - if (level == NumLevels() - 1) { - return lod_[level][idx + 1] - lod_[level][idx]; - } - // high level of LoD, and there is another lower level, return number of - // lower-level elements - auto tmp = SliceInLevel(lod_, level, idx, idx + 1); - PADDLE_ENFORCE_GE(tmp.size(), 2); - // there is a 0 as a placeholder stored in LoD, so the number of elements - // equals lod.size() - 1 - return tmp[1].size() - 1; + return lod_[level][idx + 1] - lod_[level][idx]; } void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) { diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 3d893baa35391d38372c735ad62576f3dc35a99b..dec59a5750ab24244a013282b4547fb18d4991ac 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -39,23 +39,36 @@ using Vector = thrust::host_vector< #endif /* - * 3-level LoD stores + * LoD is short for Level of Details. * - * 0 10 20 - * 0 5 10 15 20 - * 0 2 5 7 10 12 15 20 - * - * - in a level, each element indicates offset in the underlying Tensor + * - in a level, each element indicates relative offset of the lower level * - the first element should be 0 and that indicates that this sequence start * from 0 * - each sequence's begin and end(no-inclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 */ using LoD = std::vector>; +/* + * Slice levels from a LoD. + * NOTE the lowest level should always be the absolute offsets of the underlying + * tensor instances. So if higher layers are sliced without the lowest level, + * the lower level of the sliced LoD will be transformed to the absolute offset. + */ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); bool operator==(const LoD& a, const LoD& b); diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 44f09f584fb752d7003baa804979f3bb5cd9d651..e1e15abecf5534fb4fd94f7e2b65230c74d175de 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test { // 0 5 10 15 20 // 0 2 5 7 10 12 15 20 LoD lod; - lod.push_back(std::vector{0, 10, 20}); - lod.push_back(std::vector{0, 5, 10, 15, 20}); + lod.push_back(std::vector{0, 2, 3}); + lod.push_back(std::vector{0, 2, 5, 8}); lod.push_back(std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20}); ASSERT_EQ(lod.size(), 3UL); @@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); } TEST_F(LoDTensorTester, NumElements) { ASSERT_EQ(lod_tensor_.NumElements(0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1), 4UL); + ASSERT_EQ(lod_tensor_.NumElements(1), 3UL); ASSERT_EQ(lod_tensor_.NumElements(2), 8UL); } TEST_F(LoDTensorTester, NumElements2) { ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL); + ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL); + ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL); } TEST_F(LoDTensorTester, ShrinkLevels) { @@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) { LoDTensor new_lod_tensor = lod_tensor_; new_lod_tensor.ShrinkLevels(level, level + 1); ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level)); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } // shrink 2 level for (size_t level = 0; level < 2UL; ++level) { LoDTensor new_lod_tensor = lod_tensor_; new_lod_tensor.ShrinkLevels(level, level + 2); + // the lowest level's last element should be the tensor's batch_size. + ASSERT_EQ(new_lod_tensor.lod().back().back(), + lod_tensor_.lod().back().back()); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level)); - ASSERT_EQ(new_lod_tensor.NumElements(1), - lod_tensor_.NumElements(level + 1)); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } } @@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) { TEST_F(LoDTensorTester, ShrinkInLevel) { size_t level = 0; LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 2); + new_lod_tensor.ShrinkInLevel(level, 0, 1); EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL); - EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL); - EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL); - EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL); + EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL); + EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL); + EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); level = 1; new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 2); + new_lod_tensor.ShrinkInLevel(level, 1, 2); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 5bf5e91f25ab1d920ae368eaf2000fce77d2eb07..ac3ac649f96c492852a3bd69be69487736a4ddd7 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -65,12 +65,11 @@ void Scope::DropKids() { kids_.clear(); } -framework::Scope& GetGlobalScope() { - static framework::Scope* g_scope = nullptr; - if (g_scope == nullptr) { - g_scope = new framework::Scope(); - } - return *g_scope; +void Scope::DeleteScope(Scope* scope) { + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + this->kids_.erase(it); + delete scope; } } // namespace framework diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a7fce3514b163d78bf96b3cc19d188744a383395..7206b53068bac3e16db385abc76359dc45a582df 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -59,6 +59,8 @@ class Scope { /// Find the scope or an ancestor scope that contains the given variable. const Scope* FindScope(const Variable* var) const; + void DeleteScope(Scope* scope); + /// Drop all kids scopes belonged to this scope. void DropKids(); @@ -72,8 +74,5 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); }; - -framework::Scope& GetGlobalScope(); - } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index bc430852de6384ce8a02780d4e90787d58f5574c..3a2bdaf086372d5d0b07cf260feb2ee6f3cfb508 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -60,6 +60,10 @@ class Tensor { template inline T* mutable_data(platform::Place place); + inline void* mutable_data(platform::Place place, std::type_index type); + + inline void* mutable_data(platform::Place place); + /** * @brief Return a pointer to mutable memory block. * @@ -81,7 +85,6 @@ class Tensor { inline Tensor& Resize(const DDim& dims); /*! The internal of two tensors share the same memory block. */ - template inline Tensor& ShareDataWith(const Tensor& src); /** @@ -96,26 +99,9 @@ class Tensor { // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 // Remove `CopyFrom` and `CopyFromVector` from Tensor interface // and make them global functions - template inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx); - // FIXME(yuyang18): CopyFrom should without template T, use the replace - // `CopyFrom` with `CopyFromTensor` - inline void CopyFromTensor(const Tensor& src, - const platform::Place& dst_place, - const platform::DeviceContext& ctx) { - // NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions. - // That is a bug of cpplint.py. Just ignore lint these lines. - if (src.type() == std::type_index(typeid(double))) { - CopyFrom(src, dst_place, ctx); - } else if (src.type() == std::type_index(typeid(float))) { - CopyFrom(src, dst_place, ctx); - } else if (src.type() == std::type_index(typeid(int))) { - CopyFrom(src, dst_place, ctx); - } - } - /** * @brief Copy the content of an external vector to a tensor. * @@ -135,7 +121,6 @@ class Tensor { * @param[in] begin_idx The begin index of the slice. * @param[in] end_idx The end index of the slice. */ - template inline Tensor Slice(const int& begin_idx, const int& end_idx) const; platform::Place place() const { @@ -146,7 +131,6 @@ class Tensor { std::type_index type() const { return holder_->type(); } private: - template inline void check_memory_size() const; private: @@ -155,20 +139,22 @@ class Tensor { * parameter of Variable. */ struct Placeholder { - virtual ~Placeholder() {} + virtual ~Placeholder() = default; virtual void* ptr() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; virtual platform::Place place() const = 0; + virtual void set_type(std::type_index type) = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(Place place, size_t size) - : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + PlaceholderImpl(Place place, size_t size, std::type_index type) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), place_(place), - size_(size) { + size_(size), + type_(type) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -176,16 +162,20 @@ class Tensor { virtual size_t size() const { return size_; } virtual platform::Place place() const { return place_; } virtual void* ptr() const { return static_cast(ptr_.get()); } - virtual std::type_index type() const { return std::type_index(typeid(T)); } + virtual std::type_index type() const { return type_; } + virtual void set_type(std::type_index type) { type_ = type; } /*! the pointer of memory block. */ - std::unique_ptr> ptr_; + std::unique_ptr> ptr_; /*! the place of memory block. */ platform::Place place_; /*! the size of memory block. */ size_t size_; + + /* the current type of memory */ + std::type_index type_; }; /*! holds the memory block if allocated. */ diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc index 06459cbfd7b8c19c176452ff73c9f3a81ba1dc03..4c82c3638351c41df26503e2a26b5a4bb5822a67 100644 --- a/paddle/framework/tensor_array.cc +++ b/paddle/framework/tensor_array.cc @@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) { values_[index].Resize(value.dims()); values_[index].mutable_data(platform::CPUPlace()); - values_[index].CopyFrom(value, platform::CPUPlace(), - platform::CPUDeviceContext()); + values_[index].CopyFrom(value, platform::CPUPlace(), + platform::CPUDeviceContext()); } void TensorArray::WriteShared(size_t index, const LoDTensor& value) { @@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) { values_.resize(index + 1); } - values_[index].ShareDataWith(value); + values_[index].ShareDataWith(value); } LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, @@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const { result.mutable_data(platform::CPUPlace()); for (size_t idx = 0; idx < size(); idx++) { - result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace(), - platform::CPUDeviceContext()); + result.Slice(idx, idx + 1) + .CopyFrom(Read(idx), platform::CPUPlace(), + platform::CPUDeviceContext()); } return result; } @@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { auto& value = values_[elem]; if (data_shared) { // share memory - value.ShareDataWith(source.Slice(elem, elem + 1)); + value.ShareDataWith(source.Slice(elem, elem + 1)); } else { // copy value.Resize(value_dims); - value.CopyFrom(source.Slice(elem, elem + 1), - platform::CPUPlace(), - platform::CPUDeviceContext()); + value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(), + platform::CPUDeviceContext()); } } } @@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { for (size_t i = 0; i < indice.size(); i++) { auto index = indice[i]; - auto target = result.Slice(i, i + 1); - auto slice = source->Slice(index, index + 1); + auto target = result.Slice(i, i + 1); + auto slice = source->Slice(index, index + 1); - target.CopyFrom(slice, platform::CPUPlace(), - platform::CPUDeviceContext()); + target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); } return result; @@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector& source, // target is result[index] auto index = seq_meta.begin + batch_id; if (index >= seq_meta.end) break; - auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); - auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), - platform::CPUDeviceContext()); + auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); + auto target = result.Slice(index, index + 1); + target.CopyFrom(source_, platform::CPUPlace(), + platform::CPUDeviceContext()); } } diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc index d9f52509cdd1b79f6d53b5d4922f9e44279de08b..9470ac5e6ed714d5ba63f3743e683af7f8edd4b0 100644 --- a/paddle/framework/tensor_array_test.cc +++ b/paddle/framework/tensor_array_test.cc @@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test { size_t begin = level[i]; size_t end = level[i + 1]; for (size_t j = begin; j < end; j++) { - auto record = source.Slice(j, j + 1); + auto record = source.Slice(j, j + 1); for (int dim = 0; dim < 128; dim++) { record.mutable_data(platform::CPUPlace())[dim] = j - begin; } diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index ce73e0a9edbe340f1165e2dbcba8c976c55df348..f6e801bbb4a056b5590da95a4b140cb90638f322 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -19,12 +19,50 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct SizeOfTypeFunctor; + template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor functor; + size_t size = functor(type); + PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + inline void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); PADDLE_ENFORCE_GE( - holder_->size(), numel() * sizeof(T) + offset_, + holder_->size(), numel() * SizeOfType(type()) + offset_, "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory.\n" "or maybe the required data-type mismatches the data already stored."); @@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const { template inline const T* Tensor::data() const { - check_memory_size(); + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); + return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); } template inline T* Tensor::data() { - check_memory_size(); + check_memory_size(); + PADDLE_ENFORCE(std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code(), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) { template inline T* Tensor::mutable_data(platform::Place place) { static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(place, typeid(T))); +} + +inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } PADDLE_ENFORCE_GT(numel(), 0, "Tensor's numel must be larger than zero to call " "Tensor::mutable_data. Call Tensor::set_dim first."); + int64_t size = numel() * SizeOfType(type); /* some versions of boost::variant don't have operator!= */ - int64_t size = numel() * sizeof(T); if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } #endif offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +inline void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing"); + return mutable_data(place, holder_->type()); } -template inline Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); + src.check_memory_size(); *this = src; return *this; } -template inline void Tensor::CopyFrom(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx) { - src.check_memory_size(); + src.check_memory_size(); Resize(src.dims()); auto src_place = src.holder_->place(); - auto src_ptr = static_cast(src.data()); + auto src_ptr = src.data(); - auto dst_ptr = static_cast(mutable_data(dst_place)); + auto dst_ptr = mutable_data(dst_place, src.type()); - auto size = src.numel() * sizeof(T); + auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(boost::get(dst_place), dst_ptr, @@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector& src, #endif } -template inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { - check_memory_size(); + check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero."); PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound."); PADDLE_ENFORCE_LT(begin_idx, end_idx, @@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); return dst; } } @@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } -template inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; - res.ShareDataWith(src); + res.ShareDataWith(src); res.Resize(flatten_to_2d(src.dims(), num_col_dims)); return res; } diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 0b62fe08ce9e592384e55432861a943403453bb7..1bb0fb71b079940d35a995b78e04a531c074a8b2 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = @@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataWith(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #endif @@ -143,7 +143,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -167,7 +167,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) { memcpy(src_ptr, arr, 9 * sizeof(int)); auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); + dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) { // CPU Tensor to GPU Tensor auto gpu_place = new paddle::platform::GPUPlace(0); CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); + gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); + gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Slice Tensors gpu_ctx.Wait(); @@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) { CUDADeviceContext gpu_ctx(*gpu_place); gpu_tensor.CopyFromVector(src_vec, gpu_ctx); // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) { cpu_tensor.CopyFromVector(src_vec, cpu_ctx); gpu_tensor.Resize(make_ddim({2, 2})); gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) { for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - Tensor res = ReshapeToMatrix(src, 2); + Tensor res = ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } diff --git a/paddle/gserver/tests/mkldnn_branches_fc.conf b/paddle/gserver/tests/mkldnn_branches_fc.conf new file mode 100644 index 0000000000000000000000000000000000000000..fb85425c2b63c7604d636e2b0c5d20d91fb5de1b --- /dev/null +++ b/paddle/gserver/tests/mkldnn_branches_fc.conf @@ -0,0 +1,58 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=16) +channels = get_config_arg("channels", int, 2) + +def two_fc(input, group_name): + out1 = fc_layer(input=input, + name=group_name+'_fc1', + size=channels, + bias_attr=False, + act=LinearActivation()) + + out2 = fc_layer(input=input, + name=group_name+'_fc2', + size=channels, + bias_attr=False, + act=LinearActivation()) + return out1, out2 + +data = data_layer(name ="input", size=channels*16*16) + +conv = img_conv_layer(input=data, + num_channels=channels, + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=LinearActivation()) + +pool = img_pool_layer(input=conv, + pool_size=3, + stride=2, + padding=1, + pool_type=AvgPooling()) + +a1, a2 = two_fc(input=pool, group_name='a') + +concat = concat_layer(input=[a1, a2]) + +b1, b2 = two_fc(input=pool, group_name='b') + +addto = addto_layer(input=[b1, b2]) + +outputs([concat, addto]) diff --git a/paddle/gserver/tests/mkldnn_branches_pool.conf b/paddle/gserver/tests/mkldnn_branches_pool.conf new file mode 100644 index 0000000000000000000000000000000000000000..ca17c74752ab0777a69f818d9f43275a6140cb4c --- /dev/null +++ b/paddle/gserver/tests/mkldnn_branches_pool.conf @@ -0,0 +1,60 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=16) +channels = get_config_arg("channels", int, 2) + +def two_pool(input, group_name): + out1 = img_pool_layer(input=input, + name=group_name+'_pool1', + pool_size=3, + stride=2, + padding=0, + pool_type=MaxPooling()) + + out2 = img_pool_layer(input=input, + name=group_name+'_pool2', + pool_size=5, + stride=2, + padding=1, + pool_type=MaxPooling()) + return out1, out2 + +data = data_layer(name ="input", size=channels*16*16) + +conv = img_conv_layer(input=data, + num_channels=channels, + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=LinearActivation()) + +pool = img_pool_layer(input=conv, + pool_size=3, + stride=1, + padding=1, + pool_type=AvgPooling()) + +a1, a2 = two_pool(input=pool, group_name='a') + +concat = concat_layer(input=[a1, a2]) + +b1, b2 = two_pool(input=pool, group_name='b') + +addto = addto_layer(input=[b1, b2]) + +outputs([concat, addto]) diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 3571fbb9e335fc6652bdbfc3f9e35beabda5044f..6cb4ca5e08eab5b979e404c9e09dcfec11086c22 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -250,7 +250,7 @@ TEST(MKLDNNActivation, Activations) { DECLARE_string(config_args); TEST(MKLDNNLayer, branches) { - std::vector cases = {"conv"}; + std::vector cases = {"conv", "pool", "fc"}; for (auto name : cases) { std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf"; for (auto channels : {2, 32}) { diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 037bb49abc6c272eed2d27ea5d8425866ef9a1d5..e0a00ecaf04335800eab9e2e5a03628a2ce2ca8d 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker); -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel); +REGISTER_OP_CPU_KERNEL( + accuracy, ops::AccuracyKernel, + ops::AccuracyKernel, + ops::AccuracyKernel, + ops::AccuracyKernel); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 0ca9ef941d4cb15619caea2b6baed197e4b15e5a..54e6ab99dc8c8ff1afbc636e6595cd67fb64eccf 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -21,9 +21,9 @@ namespace paddle { namespace operators { using platform::PADDLE_CUDA_NUM_THREADS; -template -__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata, - const int* labeldata, float* accuracy) { +template +__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata, + const T* labeldata, float* accuracy) { int count = 0; __shared__ int total[BlockSize]; @@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { auto* accuracy = ctx.Output("Accuracy"); // FIXME(typhoonzero): only support indices currently // if add support for output values, how to detect the data type? - const int* inference_data = inference->data(); - const int* label_data = label->data(); + const T* inference_data = inference->data(); + const T* label_data = label->data(); float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); size_t num_samples = inference->dims()[0]; @@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { return; } - AccuracyCudaKernel<<< + AccuracyCudaKernel<<< 1, PADDLE_CUDA_NUM_THREADS, 0, reinterpret_cast( ctx.device_context()) @@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/conv2d_op.h b/paddle/operators/conv2d_op.h index bd1734879ef2569bfc7c3bef21677d3b0dc49a78..f629728f68d65ce81b4910cae7f89ab06d6d94b8 100644 --- a/paddle/operators/conv2d_op.h +++ b/paddle/operators/conv2d_op.h @@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel { int in_step = input_channels / groups; int out_step = output_channels / groups; for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); for (int g = 0; g < groups; g++) { // im2col - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], strides[1], paddings[0], paddings[1]); // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); } @@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel { for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_shape); + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); for (int g = 0; g < groups; g++) { // gemm Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = - filter.Slice(g * out_step, (g + 1) * out_step); + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), filter_slice, true, out_grad_slice, false, T(1.0), &col_matrix, T(0.0)); // col2im Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); col2im(context.device_context(), in_grad_slice, col, strides[0], strides[1], paddings[0], paddings[1]); } @@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel { for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); for (int g = 0; g < groups; g++) { // im2col Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); im2col(context.device_context(), in_slice, col, strides[0], strides[1], paddings[0], paddings[1]); // gemm Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); + filter_grad_.Slice(g * out_step, (g + 1) * out_step); math::matmul(context.device_context(), out_grad_slice, false, col_matrix, true, T(1.0), &filter_grad_slice, T(1.0)); diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc index 03f33e28d49fdaeccb9b6266359e0b41a1cb847f..a0b06ac1dc305bc899f9abaafcc980a6150ecda9 100644 --- a/paddle/operators/dynamic_recurrent_op.cc +++ b/paddle/operators/dynamic_recurrent_op.cc @@ -23,6 +23,7 @@ using framework::Scope; using framework::TensorArray; using framework::LoDTensor; using framework::Variable; +using framework::OperatorBase; using framework::DySeqMetaBatch; namespace detail { @@ -43,72 +44,72 @@ inline void CreateVariables(Scope& scope, * be reordered, but the RNN op should not change the `boot_state` as an input * variable's content. */ -template -inline void ReorderBootState(const DySeqMetaBatch& metas, - const LoDTensor& boot_state, LoDTensor* tensor, - const platform::Place& dst_place) { +inline void ReorderInitialState(const DySeqMetaBatch& metas, + const LoDTensor& boot_state, LoDTensor* tensor, + const platform::Place& dst_place) { for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor->Slice(seq_id, seq_id + 1); + auto slice = tensor->Slice(seq_id, seq_id + 1); auto boot_slice = - boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); + boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); // TODO(superjom) pass in device context as an argument - slice.template CopyFrom(boot_slice, dst_place, - platform::CPUDeviceContext()); + slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext()); } } -} // namespace detail - -class DynamicRecurrentOpProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { - public: - DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = DynamicRecurrentOp::kArgName; - // inputs and outputs stored in proto - AddInput(name.inlinks, - "the inputs that need to be segmented for each step.") - .AsDuplicable(); - AddInput(name.boot_memories, "variables to initialize memories.") - .AsDuplicable(); - - AddOutput(name.outlinks, "the outputs that need to concated for all steps.") - .AsDuplicable(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); - - AddComment("This is a RNN operator for varience-length sequences."); +inline void RestoreInitialState(const DySeqMetaBatch& metas, + const LoDTensor& tensor, LoDTensor* boot_state, + const platform::Place& dst_place) { + for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { + auto slice = tensor.Slice(seq_id, seq_id + 1); + auto boot_slice = + boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); + boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext()); } -}; +} -void DynamicRecurrentOp::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { - cache_.Init(kArgName, *this, scope, &arg_); +} // namespace detail + +// Implementation for forward propagation. +template <> +void RNNAlgorithm::Run( + const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx) { + SetComputeMode(ComputeMode::kForward); + cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); SplitInputs(); CreateScopes(); WriteStepInputs(); InitStates(); WriteStepOutputs(); + RunSteps(); + ConcatOutputs(); +} - // call stepnet in all the time steps - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& step_scope = cache_.GetScope(step); - stepnet_->Run(step_scope, dev_ctx); +// Implementation for backward propagation. +template <> +void RNNAlgorithm::Run( + const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx) { + SetComputeMode(ComputeMode::kBackward); + cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); + SplitInputs(); + WriteStepInputs(); + InitStates(); + WriteStepOutputs(); + RunSteps(); + // copy boot-states' gradients back. + for (const auto& state : arg_.states) { + ExportInitialStateGradient(state); } ConcatOutputs(); } -void DynamicRecurrentOp::SplitInputs() const { +void RNNAlgorithm::SplitInputs() { // TODO(superjom) make level a config // TODO(superjom) check all the inputs has the same LoD int level = 0; - for (const auto& item : cache_.inlinks) { + for (const auto& item : cache_.inputs) { const auto& var = item.second; const auto& tensor = var->Get(); TensorArray& ta = step_inputs_[item.first]; @@ -125,8 +126,8 @@ void DynamicRecurrentOp::SplitInputs() const { } } -void DynamicRecurrentOp::WriteStepInputs() const { - for (const auto& item : cache_.inlinks) { +void RNNAlgorithm::WriteStepInputs() { + for (const auto& item : cache_.inputs) { auto ta_it = step_inputs_.find(item.first); PADDLE_ENFORCE(ta_it != step_inputs_.end(), "step_inputs_ not compatible with memory set"); @@ -138,20 +139,20 @@ void DynamicRecurrentOp::WriteStepInputs() const { if (var == nullptr) { var = step_scope.Var(item.first); } - var->GetMutable()->ShareDataWith(tensor); + var->GetMutable()->ShareDataWith(tensor); } } } -void DynamicRecurrentOp::WriteStepOutputs() const { +void RNNAlgorithm::WriteStepOutputs() { // initialize step outputs - for (const auto& item : cache_.outlinks) { + for (const auto& item : cache_.outputs) { step_outputs_.emplace(item.first, TensorArray()); } PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); } -void DynamicRecurrentOp::CreateScopes() const { +void RNNAlgorithm::CreateScopes() { PADDLE_ENFORCE_GT(cache_.num_steps, 0); // resize scopes size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size(); @@ -160,19 +161,19 @@ void DynamicRecurrentOp::CreateScopes() const { } // init temporary inputs - PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first"); - std::vector memories; - std::vector pre_memories; - std::vector stepnet_outputs; - std::transform(arg_.memories.begin(), arg_.memories.end(), - std::back_inserter(memories), - [](const rnn::MemoryAttr& m) { return m.var; }); - std::transform(arg_.memories.begin(), arg_.memories.end(), - std::back_inserter(pre_memories), - [](const rnn::MemoryAttr& m) { return m.pre_var; }); - for (const auto& item : stepnet_->Outputs()) { + PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first"); + std::vector states; + std::vector ex_states; + std::vector step_unit_outputs; + std::transform(arg_.states.begin(), arg_.states.end(), + std::back_inserter(states), + [](const rnn::StateAttr& m) { return m.var; }); + std::transform(arg_.states.begin(), arg_.states.end(), + std::back_inserter(ex_states), + [](const rnn::StateAttr& m) { return m.pre_var; }); + for (const auto& item : step_unit_->Outputs()) { for (const auto& var : item.second) { - stepnet_outputs.push_back(var); + step_unit_outputs.push_back(var); } } @@ -180,13 +181,13 @@ void DynamicRecurrentOp::CreateScopes() const { auto& scope = cache_.GetScope(step); detail::CreateVariables(scope, arg_.inlinks); detail::CreateVariables(scope, arg_.outlinks); - detail::CreateVariables(scope, memories); - detail::CreateVariables(scope, pre_memories); - detail::CreateVariables(scope, stepnet_outputs); + detail::CreateVariables(scope, states); + detail::CreateVariables(scope, ex_states); + detail::CreateVariables(scope, step_unit_outputs); } } -void DynamicRecurrentOp::ConcatOutputs() const { +void RNNAlgorithm::ConcatOutputs() { // TODO(superjom) transform this to a config int level = 0; for (size_t step = 0; step < cache_.num_steps; step++) { @@ -199,31 +200,45 @@ void DynamicRecurrentOp::ConcatOutputs() const { item.second.WriteShared(step, *tensor); } } - // the inlinks' lods should be the same, so randomly get one lod. + // the inputs' lods should be the same, so randomly get one lod. const auto& some_lod = cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; for (auto& item : step_outputs_) { auto tensor = item.second.Pack(level, some_meta, some_lod); - auto* output = cache_.outlinks[item.first]->GetMutable(); - const_cast(output)->ShareDataWith(tensor); + auto* output = cache_.outputs[item.first]->GetMutable(); + const_cast(output)->ShareDataWith(tensor); + } +} + +void RNNAlgorithm::RunSteps() { + if (IsBackward()) { + // call stepnet in all the time steps reversely + for (int step = cache_.num_steps - 1; step >= 0; step--) { + auto& step_scope = cache_.GetScope(step); + step_unit_->Run(step_scope, *cache_.dev_ctx); + } + } else { + for (size_t step = 0; step < cache_.num_steps; step++) { + auto& step_scope = cache_.GetScope(step); + step_unit_->Run(step_scope, *cache_.dev_ctx); + } } } -void DynamicRecurrentOp::InitStates() const { +void RNNAlgorithm::InitStates() { for (size_t step = 0; step < cache_.num_steps; step++) { - for (const auto& memory : arg_.memories) { - CreateState(memory, step); - LinkState(memory, step); + for (const auto& state : arg_.states) { + CreateState(state, step); + LinkState(state, step); } } } -void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory, - size_t step) const { +void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) { auto& scope = cache_.GetScope(step); - auto& state = *cache_.GetTensor(scope, memory.var); - auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var); + auto& state = *cache_.GetTensor(scope, state_attr.var); + auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var); size_t num_instances = step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; @@ -232,56 +247,79 @@ void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory, state.Resize(dims); state.mutable_data(platform::CPUPlace()); - states_[memory.var].WriteShared(step, state); + states_[state_attr.var].WriteShared(step, state); } -void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory, - size_t step) const { +void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) { auto& scope = cache_.GetScope(step); - auto& state_pre = *cache_.GetTensor(scope, memory.pre_var); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + + // process the first state's boot-state(the 0-step in forward mode or the + // last step in backward mode) + // Only forward mode need to link the boot-state to the `pre-state` in first + // time step. In backward mode, need to copy the gradient of `pre-state` in + // first time step to the gradient of `boot-state`. + if (step == 0 && IsForward()) { + LinkInitialState(state); + } else { + size_t num_instances = + step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var); + // shink and share from previous state + auto shrinked_pre_state = pre_state->Slice(0, num_instances); + state_pre.ShareDataWith(shrinked_pre_state); + } +} +void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) { // all the step_inputs' metas should be the same, just randomly select one // and get the dyseq meta. const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + auto& scope = cache_.GetScope(0); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var); + pre_state->mutable_data(platform::CPUPlace()); + // allocate state + state_pre.Resize(pre_state->dims()); + state_pre.mutable_data(platform::CPUPlace()); + detail::ReorderInitialState(some_meta, *pre_state, &state_pre, + pre_state->place()); +} - LoDTensor* pre_state{nullptr}; - if (step == 0) { - pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var); - pre_state->mutable_data(platform::CPUPlace()); - // allocate memory - state_pre.Resize(pre_state->dims()); - state_pre.mutable_data(platform::CPUPlace()); - detail::ReorderBootState(some_meta, *pre_state, &state_pre, - pre_state->place()); - } else { - pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var); - } +void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) { + // all the step_inputs' metas should be the same, just randomly select one + // and get the dyseq meta. + const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; + auto& scope = cache_.GetScope(0); - // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var); + pre_state.Resize(state_pre.dims()); + detail::RestoreInitialState(some_meta, state_pre, &pre_state, + pre_state.place()); } -void DynamicRecurrentOp::ArgCache::Init( - const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, rnn::Argument* arg) { +void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name, + const paddle::framework::OperatorBase& op, + const paddle::framework::Scope& scope, + platform::DeviceContext const* dev_ctx, + rnn::Argument* arg) { this->scope = &scope; InitArgument(name, op, arg); CacheScopes(scope, *arg); CacheInlinks(scope, arg->inlinks); CacheOutlinks(scope, arg->outlinks); + this->dev_ctx = dev_ctx; } -void DynamicRecurrentOp::ArgCache::InitArgument(const rnn::ArgumentName& name, - const OperatorBase& op, - rnn::Argument* arg) { +void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name, + const OperatorBase& op, + rnn::Argument* arg) { rnn::InitArgument(name, arg, op, false /*is_grad*/); } -void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope, - const rnn::Argument& arg) { +void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope, + const rnn::Argument& arg) { auto scopes_var = scope.FindVar(arg.step_scopes); PADDLE_ENFORCE(scopes_var != nullptr, "the step_scopes output argument [%s] should be created first " @@ -290,45 +328,85 @@ void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope, this->scopes = scopes_var->GetMutable>(); } -void DynamicRecurrentOp::ArgCache::CacheInlinks( +void RNNAlgorithm::ArgCache::CacheInlinks( const Scope& scope, const std::vector& names) { for (auto name : names) { auto* var = GetVariable(scope, name); - inlinks[name] = var; + inputs[name] = var; } } -void DynamicRecurrentOp::ArgCache::CacheOutlinks( +void RNNAlgorithm::ArgCache::CacheOutlinks( const Scope& scope, const std::vector& names) { for (auto name : names) { auto* var = GetVariable(scope, name); - outlinks[name] = var; + outputs[name] = var; } } -Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope, - const std::string& name) { +Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope, + const std::string& name) { auto* var = scope.FindVar(name); PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name); return var; } -LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor( - const framework::Scope& scope, const std::string& name) { +LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope, + const std::string& name) { auto* var = GetVariable(scope, name); return var->GetMutable(); } -const rnn::ArgumentName DynamicRecurrentOp::kArgName{ - "step_net", "step_scopes", "inlinks", "outlinks", - "memories", "pre_memories", "boot_memories"}; +const std::array RNNAlgorithm::kArgNames{ + {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", + "states", "ex_states", "initial_states"}, + rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD", + "inputs@GRAD", "states", "ex_states", + "initial_states@GRAD"}}}; + +void DynamicRecurrentOp::Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + rnn.Run( + scope, *dynamic_cast(this), dev_ctx); +} void DynamicRecurrentGradientOp::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const {} + const Scope& scope, const platform::DeviceContext& dev_ctx) const { + rnn.Run( + scope, *dynamic_cast(this), dev_ctx); +} + +class DynamicRecurrentOpProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + public: + DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = + RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward]; + // inputs and outputs stored in proto + AddInput(name.inlinks, + "the inputs that need to be segmented for each step.") + .AsDuplicable(); + AddInput(name.initial_states, "variables to initialize states.") + .AsDuplicable(); + + AddOutput(name.outlinks, "the outputs that need to concated for all steps.") + .AsDuplicable(); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.ex_states, "names of ex_states"); + AddAttr>(name.states, "names of states"); + + AddComment("This is a RNN operator for varience-length sequences."); + } +}; } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - dynamic_recurrent, paddle::operators::DynamicRecurrentOp, - paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker); +REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp, + paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker, + dynamic_recurrent_grad, + paddle::operators::DynamicRecurrentGradientOp); diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h index ec80a1c90eee3a655febe0dd3d6c67c16ec6c64b..5b0548c3a44c9f58838ecc567ee41a587883c26a 100644 --- a/paddle/operators/dynamic_recurrent_op.h +++ b/paddle/operators/dynamic_recurrent_op.h @@ -27,47 +27,39 @@ namespace paddle { namespace operators { -class DynamicRecurrentOp : public framework::OperatorBase { +class RNNAlgorithm { public: - static const rnn::ArgumentName kArgName; + enum ComputeMode { kForward = 0, kBackward = 1 }; + static const std::array kArgNames; using value_type = float; - DynamicRecurrentOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentOp(const DynamicRecurrentOp& o) - : framework::OperatorBase( - static_cast(o)) { - // TODO(yuyang18): Implement copy ctor well. - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - + /* + * Different `Run` method for forward and backward, `_` is just for template + * specifialization. + */ + template + void Run(const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx); /* * Split the inputs(LoDTensors) to segments for each time step. */ - void SplitInputs() const; + void SplitInputs(); /* * Create step-scopes to store temporary outputs in each time steps. */ - void CreateScopes() const; + void CreateScopes(); /* * Link TensorArray steps to the corresponding variables located in * step-scopes. */ - void WriteStepInputs() const; + void WriteStepInputs(); /* * Write output of each step to the corresponding TensorArray. */ - void WriteStepOutputs() const; + void WriteStepOutputs(); /* * Initialize the states, each state will have a corresponding pre-state, @@ -75,54 +67,83 @@ class DynamicRecurrentOp : public framework::OperatorBase { * pre-state in the first time step will be initialized with an zero tensor or * a tensor in parent scope if is provided. */ - void InitStates() const; + void InitStates(); /* * Create state variables for each time step. */ - void CreateState(const rnn::MemoryAttr& memory, size_t step) const; + void CreateState(const rnn::StateAttr& state, size_t step); /* * Link pre-state variable in current scope to the state variable in the - * previous time step (scope). + * previous time step (scope) by reference. + */ + void LinkState(const rnn::StateAttr& state, size_t step); + + /* + * Link the pre-state of the first time step to the `boot-state` in parent's + * scope. + */ + void LinkInitialState(const rnn::StateAttr& state); + + /* + * Copy the gradient from `pre-state` in the first step-scope to the + * `boot-state` in parent's scope. + */ + void ExportInitialStateGradient(const rnn::StateAttr& state); + + /* + * Calculate time steps. */ - void LinkState(const rnn::MemoryAttr& memory, size_t step) const; + void RunSteps(); /* * Concatenate outputs in each time step and generate a LoDTensor. */ - void ConcatOutputs() const; + void ConcatOutputs(); + + void SetComputeMode(ComputeMode mode) { mode_ = mode; } + bool IsForward() const { return mode_ == ComputeMode::kForward; } + bool IsBackward() const { return mode_ == ComputeMode::kBackward; } /* - * set a stepnet that is created according to a RecurrentOp's stepnet. + * set a step unit that is created according to a RecurrentOp's step unit. */ - void SetStepNet(std::unique_ptr net) { - PADDLE_ENFORCE_NOT_NULL(net); - stepnet_ = std::move(net); + void SetStepUnit(std::unique_ptr step_unit) { + PADDLE_ENFORCE_NOT_NULL(step_unit); + step_unit_ = std::move(step_unit); } - const OperatorBase& GetStepNet() const { return *stepnet_; } + const framework::OperatorBase& GetStepUnit() const { return *step_unit_; } const framework::TensorArray& state(const std::string& name) const { - return states_[name]; + auto it = states_.find(name); + PADDLE_ENFORCE(it != states_.end()); + return it->second; } const framework::TensorArray& step_input(const std::string& name) const { - return step_inputs_[name]; + auto it = step_inputs_.find(name); + PADDLE_ENFORCE(it != step_inputs_.end()); + return it->second; } const framework::TensorArray& step_output(const std::string& name) const { - return step_outputs_[name]; + auto it = step_outputs_.find(name); + PADDLE_ENFORCE(it != step_outputs_.end()); + return it->second; } protected: struct ArgCache { framework::Scope const* scope; std::vector* scopes; - std::map inlinks; - std::map outlinks; + std::map inputs; + std::map outputs; + platform::DeviceContext const* dev_ctx; size_t num_steps{0}; - void Init(const rnn::ArgumentName& name, const OperatorBase& op, - const framework::Scope& scope, rnn::Argument* arg); + void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op, + const framework::Scope& scope, + platform::DeviceContext const* dev_ctx, rnn::Argument* arg); framework::Scope& GetScope(size_t index) { PADDLE_ENFORCE_LT(index, num_steps); @@ -133,8 +154,8 @@ class DynamicRecurrentOp : public framework::OperatorBase { const std::string& name); private: - void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op, - rnn::Argument* arg); + void InitArgument(const rnn::ArgumentName& name, + const framework::OperatorBase& op, rnn::Argument* arg); void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg); void CacheInlinks(const framework::Scope& scope, const std::vector& names); @@ -145,27 +166,49 @@ class DynamicRecurrentOp : public framework::OperatorBase { }; private: - std::unique_ptr stepnet_; - mutable std::map states_; - mutable std::map step_inputs_; - mutable std::map step_outputs_; - mutable std::map> - dy_seq_metas_; - mutable rnn::Argument arg_; - mutable ArgCache cache_; + std::unique_ptr step_unit_; + std::map states_; + std::map step_inputs_; + std::map step_outputs_; + std::map> dy_seq_metas_; + rnn::Argument arg_; + ArgCache cache_; + ComputeMode mode_{ComputeMode::kForward}; #ifdef PADDLE_WITH_TESTING - friend class DynamicRecurrentOpTestHelper; - FRIEND_TEST(DynamicRecurrentOpTestHelper, SplitInputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateCache); - FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateScopes); - FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepInputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepOutputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, InitStates); - FRIEND_TEST(DynamicRecurrentOpTestHelper, ConcatOutputs); + // test forward + friend class RNNAlgorithmTestHelper; + FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs); + FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache); + FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes); + FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs); + FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs); + FRIEND_TEST(RNNAlgorithmTestHelper, InitStates); + FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs); +// TODO(superjom) test backward #endif }; +class DynamicRecurrentOp : public framework::OperatorBase { + public: + DynamicRecurrentOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + DynamicRecurrentOp(const DynamicRecurrentOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented"); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override; + + mutable RNNAlgorithm rnn; +}; + class DynamicRecurrentGradientOp : public framework::OperatorBase { public: DynamicRecurrentGradientOp(const std::string& type, @@ -174,8 +217,16 @@ class DynamicRecurrentGradientOp : public framework::OperatorBase { const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} + DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented"); + } + void Run(const framework::Scope& scope, const platform::DeviceContext& dev_ctx) const override; + + mutable RNNAlgorithm rnn; }; } // namespace operators diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc index 36f405568d7e4ed9a469c3af7a80192b83142b7a..fff63efb24c70b7e864e2d5b011a22883c13dede 100644 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ b/paddle/operators/dynamic_recurrent_op_test.cc @@ -43,16 +43,16 @@ LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims, return tensor; } -class DynamicRecurrentOpTestHelper : public ::testing::Test { +class RNNAlgorithmTestHelper : public ::testing::Test { protected: - const rnn::ArgumentName argname = DynamicRecurrentOp::kArgName; + const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0]; virtual void SetUp() override { CreateGlobalVariables(); auto op_desc = CreateOpDesc(); op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr); - dop = dynamic_cast(op.get()); + dop = &(dynamic_cast(op.get())->rnn); InitCacheManually(); InitStepNet(); } @@ -63,20 +63,20 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test { op_desc.set_type("dynamic_recurrent"); OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs()); - OpDescNewVar(argname.boot_memories, {"boot_mem"}, op_desc.add_inputs()); + OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs()); OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs()); OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs()); - // set pre-memories + // set pre-states auto pre_memories = op_desc.mutable_attrs()->Add(); - pre_memories->set_name(argname.pre_memories); + pre_memories->set_name(argname.ex_states); pre_memories->set_type(paddle::framework::AttrType::STRINGS); auto pre_memories_item = pre_memories->add_strings(); *pre_memories_item = "mem@pre"; - // set memories + // set states auto memories = op_desc.mutable_attrs()->Add(); - memories->set_name(argname.memories); + memories->set_name(argname.states); memories->set_type(paddle::framework::AttrType::STRINGS); auto memories_item = memories->add_strings(); *memories_item = "mem"; @@ -113,32 +113,33 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test { } void InitCacheManually() { - dop->cache_.Init(DynamicRecurrentOp::kArgName, *dop, scope, &dop->arg_); + dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context, + &dop->arg_); } void InitStepNet() { std::unique_ptr stepnet{new NetOp}; dynamic_cast(stepnet.get()) ->AppendOp(std::unique_ptr(new TestOp( - "test", {{"inlinks", {"in0"}}, {"boot_memories", {"boot_mem"}}}, - {{"outlinks", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); - dop->SetStepNet(std::move(stepnet)); + "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}}, + {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); + dop->SetStepUnit(std::move(stepnet)); } protected: - DynamicRecurrentOp* dop; + RNNAlgorithm* dop; std::unique_ptr op; paddle::platform::CPUDeviceContext device_context; paddle::framework::Scope scope; }; -TEST_F(DynamicRecurrentOpTestHelper, CreateCache) { +TEST_F(RNNAlgorithmTestHelper, CreateCache) { const rnn::Argument& arg = dop->arg_; ASSERT_EQ(arg.inlinks.size(), 1UL); ASSERT_EQ(arg.outlinks.size(), 1UL); } -TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) { +TEST_F(RNNAlgorithmTestHelper, SplitInputs) { dop->SplitInputs(); auto& in0_ta = dop->step_inputs_["in0"]; ASSERT_EQ(in0_ta.size(), 4UL); @@ -153,14 +154,14 @@ TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) { EXPECT_EQ(batch3.dims()[0], 1); } -TEST_F(DynamicRecurrentOpTestHelper, CreateScopes) { +TEST_F(RNNAlgorithmTestHelper, CreateScopes) { dop->SplitInputs(); dop->CreateScopes(); ASSERT_EQ(dop->cache_.num_steps, 4UL); ASSERT_EQ(dop->cache_.scopes->size(), 4UL); } -TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) { +TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) { dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -173,7 +174,7 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) { } } -TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) { +TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) { dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -187,11 +188,12 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) { } } -TEST_F(DynamicRecurrentOpTestHelper, ConcatOutputs) { +TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) { // Let's leave this test to python unittest. } -TEST_F(DynamicRecurrentOpTestHelper, InitStates) { +TEST_F(RNNAlgorithmTestHelper, InitStates) { + dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward); dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -208,12 +210,6 @@ TEST_F(DynamicRecurrentOpTestHelper, InitStates) { auto* boot_state = scope.FindVar("boot_mem"); ASSERT_TRUE(boot_state != nullptr); - - if (step == 0) { - // check pre_state is a reference of boot_state - ASSERT_EQ(boot_state->Get().data(), - pre_state->Get().data()); - } } } diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 3eb97f60b59848d23bcd15ea1e3d2f21b721f6a4..488a35aafc8600bb8bb252fc3a5161c72a2f6df1 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -108,7 +108,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input.") - if (x_dims == y_dims || product(y_dims) == 1) { + if (x_dims == y_dims) { functor f; f.template Run(x, y, z, ctx); return; @@ -174,12 +174,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { return; } - if (product(y_dims) == 1) { - functor1 f; - f(place, x, y, out, dx, dy, dout); - return; - } - int axis = ctx.Attr("axis"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index bf453c85966848d492606644a380a57196ab9869..0f1722a5383c80ff2ede0801d34f22a80fbc6e52 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - out_item->CopyFromTensor(feed_item, dev_ctx.GetPlace(), dev_ctx); + out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 524e77d6ad3a1c7a96e104405827205f704f8a59..c1b3d66bac4c703ce78b247aadc2975bb146b5b0 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - dst_item.CopyFromTensor(src_item, platform::CPUPlace(), dev_ctx); + dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx); VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; } diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..139392c691e00b2a94f46801f1cfc2018ce139f5 --- /dev/null +++ b/paddle/operators/increment_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/increment_op.h" + +namespace paddle { +namespace operators { + +class IncrementOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IncrementOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IncrementOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IncrementOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input tensor of increment operator"); + AddOutput("Out", "(Tensor) The output tensor of increment operator."); + AddComment(R"DOC(Increment operator + +The equation is: Out = X + step +)DOC"); + AddAttr("step", + "The step size by which the " + "input tensor will be incremented.") + .SetDefault(1.0); + } +}; + +class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDescBind(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", 1.0f); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, + ops::IncrementGradOpMaker); +REGISTER_OP_CPU_KERNEL(increment, + ops::IncrementKernel); diff --git a/paddle/operators/increment_op.cu b/paddle/operators/increment_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..659c380d147a36650452bea23b30cbcf1ff516ee --- /dev/null +++ b/paddle/operators/increment_op.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/increment_op.h" + +REGISTER_OP_GPU_KERNEL( + increment, + paddle::operators::IncrementKernel); diff --git a/paddle/operators/increment_op.h b/paddle/operators/increment_op.h new file mode 100644 index 0000000000000000000000000000000000000000..342e254fc453555c70923efbca02fdfd014af015 --- /dev/null +++ b/paddle/operators/increment_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class IncrementKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + tensor->mutable_data(in->place()); + + auto step = static_cast(context.Attr("step")); + + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto& place = context.GetEigenDevice(); + eigen_out.device(place) = eigen_in + step; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 9c506ae89bdda38f40fb37e4c4e5f990cd5978b7..443c94b83f0bf24837afe703b19e2ab47a0dd786 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -64,7 +64,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -85,8 +85,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), - *context); + output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } EXPECT_EQ(out_cfo_ptr[0], 0); @@ -102,8 +101,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), - *context); + output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); out_ocf_ptr = output_tmp.data(); } EXPECT_EQ(out_ocf_ptr[0], 0); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 14359d835bba794703a313d70f34082868474b20..8b22c71552a65044cbd02441fb35c1eafe0173dc 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); context.Wait(); @@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input1, *gpu_place, context); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + out.CopyFrom(out_gpu, *cpu_place, context); float* out_ptr = out.data(); context.Wait(); @@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + input3.CopyFrom(input3_gpu, *cpu_place, context); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + input1_gpu.CopyFrom(input1, *gpu_place, context); + input2_gpu.CopyFrom(input2, *gpu_place, context); + input3_gpu.CopyFrom(input3, *gpu_place, context); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + input3.CopyFrom(input3_gpu, *cpu_place, context); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 8a9f25b98263c3bef50c38f358a20ea98ebe6324..69607c5afc46921c08ce278bf164e5bed7b446f8 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + out_cpu.CopyFrom(*out_value, cpu_place, ctx); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); + tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 2d69218843a69497b5b501d4297f2ec5ab26a844..74590d17cd0f974f830e760d85daef8ab5318a43 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -78,7 +78,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -93,7 +93,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); + output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); out_cfo_ptr = output_tmp.data(); } @@ -107,7 +107,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + input.CopyFrom(input_tmp, *place, *context); } paddle::operators::math::Col2VolFunctor col2vol; @@ -118,7 +118,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 8ae54e1eec33c4bce563f697bafbdc68f97ab746..5ce30740c90b5cd0bd4f8ab183cf985ed5d827c1 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -46,7 +46,7 @@ class MatMulKernel : public framework::OpKernel { template inline Tensor Reshape(const Tensor& input, const DDim& dims) { Tensor output; - output.ShareDataWith(input); + output.ShareDataWith(input); output.Resize(dims); return output; } @@ -56,7 +56,7 @@ inline Tensor Reshape(const Tensor& input, const DDim& dims) { template Tensor CombineBatchAndM(const Tensor& input) { Tensor output; - output.ShareDataWith(input); + output.ShareDataWith(input); auto in_dims = input.dims(); if (in_dims.size() == 3) { std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; @@ -80,7 +80,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize(make_ddim(out_dims)); } else { - output.ShareDataWith(input); + output.ShareDataWith(input); } return output; } diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 9be4d15a43d87ae1a27c81498e8b19b0049a3bfa..2d4d6f13720f0e6888edbddcb3243116506227ba 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -75,12 +75,17 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(Tensor) Output updated velocity"); AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("useNesterov", "(bool) Use Nesterov Momentum") + .SetDefault(false); AddComment(R"DOC( -Momentum Algorithm (momentum). +Momentum Algorithm with a flag for Nestrov Moemntum (momentum). velocity = mu * velocity + gradient -param = param - learning_rate * velocity +if (use_nesterov): + param = param - gradient * learning_rate + mu * velocity * learning_rate +else: + param = param - learning_rate * velocity )DOC"); } diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h index f7a724f048782ceee8509ddafcb4834fd8dbba8a..e6d6d1da3df9f7e43a93fcc2e12658a01a491f81 100644 --- a/paddle/operators/momentum_op.h +++ b/paddle/operators/momentum_op.h @@ -34,6 +34,7 @@ class MomentumOpKernel : public framework::OpKernel { velocity_out->mutable_data(ctx.GetPlace()); float mu = ctx.Attr("mu"); + bool use_nesterov = ctx.Attr("useNesterov"); auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); @@ -46,8 +47,14 @@ class MomentumOpKernel : public framework::OpKernel { auto place = ctx.GetEigenDevice(); Eigen::DSizes grad_dsize(grad->numel()); + v_out.device(place) = v * mu + g; - p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + if (use_nesterov) { + p_out.device(place) = p - g * lr.broadcast(grad_dsize) + + v_out * mu * lr.broadcast(grad_dsize); + } else { + p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + } } }; diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 684b1ea0c0c8ddabc9809cc05ed985e0cc250955..3f3e77595b701d428a728fc4727dd3ff4abee45f 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -36,12 +36,12 @@ class MulKernel : public framework::OpKernel { Tensor* z = context.Output("Out"); const Tensor x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix( + ? framework::ReshapeToMatrix( *x, context.template Attr("x_num_col_dims")) : *x; const Tensor y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix( + ? framework::ReshapeToMatrix( *y, context.template Attr("y_num_col_dims")) : *y; @@ -59,30 +59,30 @@ class MulGradKernel : public framework::OpKernel { int y_num_col_dims = ctx.template Attr("y_num_col_dims"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); - const Tensor x_matrix = - x->dims().size() > 2 ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : *x; - const Tensor y_matrix = - y->dims().size() > 2 ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : *y; + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; const Tensor* dout = ctx.Input(framework::GradVarName("Out")); Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); if (dx) { dx->mutable_data(ctx.GetPlace()); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, x_num_col_dims) - : *dx; + Tensor dx_matrix = dx->dims().size() > 2 + ? framework::ReshapeToMatrix(*dx, x_num_col_dims) + : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N math::matmul(ctx.device_context(), *dout, false, y_matrix, true, 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, y_num_col_dims) - : *dy; + Tensor dy_matrix = dy->dims().size() > 2 + ? framework::ReshapeToMatrix(*dy, y_num_col_dims) + : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K math::matmul(ctx.device_context(), x_matrix, true, *dout, false, 1, &dy_matrix, 0); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10cb0e005f483abe91b4ee862ea5b48305ec08c7..143a14fef5783f8ed085d4c4ce2afb3b190d0600 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,8 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), - ctx.device_context()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( ctx.device_context()) @@ -71,8 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), - ctx.device_context()); + index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); auto* index = index_t_cpu.data(); auto stream = reinterpret_cast( diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index e3d08378c2f29fa5d84c24ae7cebfcb0e7a53b25..40303e3adf4db7e8336ed72667fe69afa56c3f69 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -42,7 +42,7 @@ void RecurrentAlgorithm::Run(const Scope& scope, for (size_t step_id = 0; step_id < seq_len; step_id++) { if (step_id > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); + rnn::LinkMemories(step_scopes, arg_->states, step_id, -1); } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } @@ -59,7 +59,8 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope, // Now all variables in scope must be created outside of op. PADDLE_ENFORCE_NOT_NULL(stepnet_); - PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs"); + PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), + "step_unit_ op has no outputs"); if (seq_len > step_scopes->size()) { for (size_t i = step_scopes->size(); i < seq_len; ++i) { @@ -86,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope, } void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { - for (auto& attr : arg_->memories) { + for (auto& attr : arg_->states) { auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable(); PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, "memory [%s]'s boot variable [%s] not exists", attr.var, @@ -95,17 +96,17 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { step_scope->FindVar(attr.boot_var)->GetMutable(); pre_mem->Resize(boot_mem->dims()); PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2); - pre_mem->ShareDataWith(*boot_mem); + pre_mem->ShareDataWith(*boot_mem); } } const rnn::ArgumentName RecurrentOp::kArgName{ - "step_net", "step_scopes", "inlinks", "outlinks", - "memories", "pre_memories", "boot_memories"}; + "step_net", "step_scopes", "inputs", "outputs", + "states", "ex_states", "initial_states"}; const rnn::ArgumentName RecurrentGradientOp::kArgName{ - "step_net", "step_scopes@GRAD", "outlinks@GRAD", "inlinks@GRAD", - "memories", "pre_memories", "boot_memories@GRAD"}; + "step_net", "step_scopes@GRAD", "outputs@GRAD", "inputs@GRAD", + "states", "ex_states", "initial_states@GRAD"}; RecurrentOp::RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs, @@ -127,7 +128,7 @@ class RecurrentAlgorithmProtoAndCheckerMaker AddInput(name.inlinks, "the inputs that need to be segmented for each step.") .AsDuplicable(); - AddInput(name.boot_memories, "variables to initialize memories.") + AddInput(name.initial_states, "variables to initialize states.") .AsDuplicable(); AddOutput(name.outlinks, "the outputs that need to concated for all steps.") @@ -135,9 +136,8 @@ class RecurrentAlgorithmProtoAndCheckerMaker AddOutput(name.step_scopes, "step scopes"); // Attributes stored in AttributeMap - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); + AddAttr>(name.ex_states, "names of pre-states"); + AddAttr>(name.states, "names of states"); AddComment("This is a recurrent group operator."); } @@ -152,7 +152,7 @@ void RecurrentGradientAlgorithm::Run( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len); for (int step_id = seq_len - 1; step_id >= 0; --step_id) { if (static_cast(step_id) != seq_len - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + rnn::LinkMemories(step_scopes, arg_->states, step_id, 1); } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } @@ -162,7 +162,7 @@ void RecurrentGradientAlgorithm::Run( void RecurrentGradientAlgorithm::LinkBootMemoryGradients( Scope* step_scope) const { - for (auto& attr : arg_->memories) { + for (auto& attr : arg_->states) { PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, "memory variable [%s] does not exists", attr.var); PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, @@ -171,7 +171,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients( auto* boot_mem_grad = step_scope->Var(attr.boot_var)->GetMutable(); boot_mem_grad->Resize(mem_grad->dims()); - boot_mem_grad->ShareDataWith(*mem_grad); + boot_mem_grad->ShareDataWith(*mem_grad); } } diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 3ba4611458fda0aa2f234c29d27086cd6f5742cc..c89cdf8cab9f209667c5e09b521b8f6e30f202fd 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); - out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); + out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); out->Resize(out_dims); } }; @@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); + d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index 30b8ddeb5bc4220e261a5c37ac195b0348fef936..ee61ea300c33722471189d06eb09f67a083d2a4d 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -36,14 +36,14 @@ void SegmentInputs(const std::vector& step_scopes, LoDTensor* input = input_var->GetMutable(); f::DDim dims = input->dims(); PADDLE_ENFORCE_EQ(static_cast(dims[0]), seq_len, - "all the inlinks be the same length"); + "all the inputs be the same length"); f::DDim step_dims = slice_ddim(dims, 1, dims.size()); for (size_t j = 0; j < seq_len; j++) { Tensor* step_input = step_scopes[j]->Var(inlinks[i])->GetMutable(); // The input of operators of each step is Tensor here. // Maybe need to modify Slice function. - *step_input = input->Slice(j, j + 1); + *step_input = input->Slice(j, j + 1); step_input->Resize(step_dims); } } @@ -71,14 +71,14 @@ void ConcatOutputs(const std::vector& step_scopes, step_scopes[j]->FindVar(outlinks[i])->GetMutable(); // TODO(luotao02) data type and platform::DeviceContext() should set // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace(), ctx); + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace(), ctx); } } } void LinkMemories(const std::vector& scopes, - const std::vector& memories, + const std::vector& memories, const size_t step_id, const int offset) { PADDLE_ENFORCE_LT(step_id, scopes.size(), "step [%d] is out of range of step scopes' size [%d]", @@ -95,7 +95,7 @@ void LinkMemories(const std::vector& scopes, auto* mem = scope->FindVar(attr.pre_var)->GetMutable(); auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); mem->Resize(linked_mem->dims()); - mem->ShareDataWith(*linked_mem); + mem->ShareDataWith(*linked_mem); } } @@ -106,26 +106,26 @@ void InitArgument(const ArgumentName& name, Argument* arg, arg->inlinks = op.Inputs(name.inlinks); arg->outlinks = op.Outputs(name.outlinks); - auto& boot_memories = - is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories); + auto& boot_memories = is_grad ? op.Outputs(name.initial_states) + : op.Inputs(name.initial_states); // attributes - auto& memories = op.Attr>(name.memories); - auto& pre_memories = op.Attr>(name.pre_memories); + auto& memories = op.Attr>(name.states); + auto& pre_memories = op.Attr>(name.ex_states); PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of memories, boot_memories don't match:%d,%d", + "the size of states, initial_states don't match:%d,%d", memories.size(), boot_memories.size()); PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of pre_memories, boot_memories don't match:%d,%d", + "the size of ex_states, initial_states don't match:%d,%d", pre_memories.size(), boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set"); for (size_t i = 0; i < memories.size(); ++i) { - rnn::MemoryAttr mem_attr; + rnn::StateAttr mem_attr; mem_attr.var = memories[i]; mem_attr.pre_var = pre_memories[i]; mem_attr.boot_var = boot_memories[i]; - (arg->memories).push_back(mem_attr); + (arg->states).push_back(mem_attr); } } diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h index fe173edb24ad015b9546546565027358f9b93476..fb0e158e07745d58c6211d33e385b324e492b95e 100644 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -31,7 +31,7 @@ using Scope = framework::Scope; * boot memories in father scope. Other attributes are copied from Op's proto * attributes. */ -struct MemoryAttr { +struct StateAttr { // name of current state variable std::string var; // name of previous step's state variable @@ -46,7 +46,7 @@ struct Argument { std::string step_scopes; std::vector inlinks; std::vector outlinks; - std::vector memories; + std::vector states; }; struct ArgumentName { @@ -54,9 +54,9 @@ struct ArgumentName { std::string step_scopes; std::string inlinks; std::string outlinks; - std::string memories; // the memory name - std::string pre_memories; // the previous memory name - std::string boot_memories; // the boot memory name + std::string states; // the memory name + std::string ex_states; // the previous memory name + std::string initial_states; // the boot memory name }; /** @@ -74,7 +74,7 @@ void ConcatOutputs(const std::vector& step_scopes, const size_t seq_len, const platform::DeviceContext& ctx); void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, const size_t step_id, + const std::vector& memories, const size_t step_id, const int offset); void InitArgument(const ArgumentName& name, Argument* arg, diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 06f4d759447b6dcd28b50576dfc246fc466d9336..3b32ae2fb77a5d3d4c558742ec469c74d15eee07 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -30,7 +30,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel { auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - Out->ShareDataWith(*Ref); + Out->ShareDataWith(*Ref); GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); } @@ -48,7 +48,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + dRef->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Index] GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h index 6101219006414e4865f676e3ca5d2a88949ad17a..1a4f6f99bfe36cd0de2d4f2af3f6054571d8f188 100644 --- a/paddle/operators/scatter_op.h +++ b/paddle/operators/scatter_op.h @@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); // In place output: Out = Ref, Out[Index] += Updates - Out->ShareDataWith(*Ref); + Out->ShareDataWith(*Ref); // Apply ScatterUpdate: Out[index] += Updates[:] ScatterAssign(ctx.device_context(), *Updates, *Index, Out); } @@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *dOut = ctx.Input(framework::GradVarName("Out")); // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + dRef->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates += dO[Index] CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index a197a05bbb881806b24f9dcce5282a4d972e3adc..6adf96120c99f9b84a1ff947058e65ac3ddff1d4 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -87,16 +87,16 @@ class SequenceConcatOpKernel : public framework::OpKernel { auto out_lod_level = out_lod[level]; for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { - Tensor out_t = out->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); + Tensor out_t = out->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); auto out_stride = framework::stride(out_t.dims()); size_t offset = 0; for (size_t j = 0; j < n; ++j) { auto in_lod_level = ins[j]->lod()[level]; auto in_stride = framework::stride(ins[j]->dims()); - Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), - static_cast(in_lod_level[i + 1])); + Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), + static_cast(in_lod_level[i + 1])); size_t axis_dim = in_t.dims()[axis]; StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, in_t.dims(), out_stride, out_t.data() + offset); @@ -130,8 +130,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { Tensor out_grad_t = - out_grad->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); + out_grad->Slice(static_cast(out_lod_level[i]), + static_cast(out_lod_level[i + 1])); auto out_grad_stride = framework::stride(out_grad_t.dims()); size_t offset = 0; @@ -139,8 +139,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { auto x_grad_lod_level = x_grads[j]->lod()[level]; auto x_grad_stride = framework::stride(x_grads[j]->dims()); Tensor x_grad_t = - x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), - static_cast(x_grad_lod_level[i + 1])); + x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), + static_cast(x_grad_lod_level[i + 1])); size_t axis_dim = x_grad_t.dims()[axis]; StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, out_grad_stride, out_grad_t.dims(), x_grad_stride, diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index a5569d1aace215c848de43dd9c3dcb414b709083..0de6cafe9ca83f09636a69b5579d19afde1c73b5 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -64,9 +64,9 @@ class SequencePoolKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); auto place = context.GetEigenDevice(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - Tensor in_t = in->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - Tensor out_t = out->Slice(i, i + 1); + Tensor in_t = in->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); + Tensor out_t = out->Slice(i, i + 1); int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); @@ -116,9 +116,9 @@ class SequencePoolGradKernel : public framework::OpKernel { } auto place = context.GetEigenDevice(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - auto in_g_t = in_g->Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - auto out_g_t = out_g->Slice(i, i + 1); + auto in_g_t = + in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + auto out_g_t = out_g->Slice(i, i + 1); int64_t h = static_cast(lod[i + 1] - lod[i]); auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 96d87c404d217280d74bd088e7a23f539ef6e7ce..3eb1e2844dff6ac94e86dcf4586bb51bc33adbec 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -46,8 +46,8 @@ class SequenceSoftmaxKernel : public framework::OpKernel { for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); - Tensor x_i = x->Slice(start_pos, end_pos); - Tensor out_i = out->Slice(start_pos, end_pos); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); @@ -75,9 +75,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); - Tensor out_i = out->Slice(start_pos, end_pos); - Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); - Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index d03a1a76585bc79633d089b776ca07ba908085ba..68ac2b0ea36dda55ac1161eecb80f03178b4f303 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -85,7 +85,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + logit_grad->ShareDataWith(*context.Input("Softmax")); T* logit_grad_data = logit_grad->data(); const int batch_size = logit_grad->dims()[0]; diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index 66d7bc1569e124096f30b6cd91fe22189506e4a5..01027cf63fc1010a226346609d583af0b400ecbb 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const Tensor* labels = context.Input("Label"); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + logit_grad->ShareDataWith(*context.Input("Softmax")); const int class_num = logit_grad->dims()[1]; if (context.Attr("soft_label")) { diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 612bdd70db28f2c1fbeb66456fae4ca865530f1f..f244ddc51fab3a6a82ffe517e35a97bc77f61b3e 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -53,10 +53,10 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto& dims = ctx->Attrs().Get>("dims"); + auto& shape = ctx->Attrs().Get>("shape"); std::vector temp; - temp.reserve(dims.size()); - for (auto dim : dims) { + temp.reserve(shape.size()); + for (auto dim : shape) { temp.push_back(static_cast(dim)); } ctx->SetOutputDim("Out", framework::make_ddim(temp)); @@ -78,7 +78,7 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC(Uniform random operator. Used to initialize tensor with uniform random generator. )DOC"); - AddAttr>("dims", "the dimension of random tensor"); + AddAttr>("shape", "the dimension of random tensor"); AddAttr("min", "Minimum value of uniform random").SetDefault(-1.0f); AddAttr("max", "Maximun value of uniform random").SetDefault(1.0f); AddAttr("seed", diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 84ebe3c2b84a5b4fd3fb5d49494a19dea873b9c4..e5ddc14587623905dbf52b4c1690236ffeb069a1 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -84,10 +84,12 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("set_float_element", TensorSetElement) @@ -217,8 +219,7 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) - .def("drop_kids", &Scope::DropKids) - .def_static("global_scope", &GetGlobalScope); + .def("drop_kids", &Scope::DropKids); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -412,18 +413,18 @@ All parameter, weight, gradient are variables in Paddle. return static_cast( rnn_op.release()); }) - .def("set_stepnet", + .def("set_step_unit", [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) - -> void { self.SetStepNet(net.Clone()); }) + -> void { self.rnn.SetStepUnit(net.Clone()); }) .def("get_state", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.state(name); }) + -> const TensorArray & { return self.rnn.state(name); }) .def("get_step_input", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.step_input(name); }) + -> const TensorArray & { return self.rnn.step_input(name); }) .def("get_step_output", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.step_output(name); }); + -> const TensorArray & { return self.rnn.step_output(name); }); // cond_op py::class_(m, "CondOp") @@ -449,19 +450,15 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init &>()) - .def("run", - [](Executor &self, ProgramDescBind *program_bind, int block_id) { - framework::Scope &global_scope = GetGlobalScope(); - self.Run(*program_bind->Proto(), &global_scope, block_id); - }); + .def("run", [](Executor &self, ProgramDescBind *program_bind, + Scope *scope, int block_id) { + self.Run(*program_bind->Proto(), scope, block_id); + }); m.def("unique_integer", UniqueIntegerGenerator); m.def("is_compile_gpu", IsCompileGPU); - //! FIXME: it is no need to `set_xxx_float/double/int` - m.def("set_feed_variable_float", framework::SetFeedVariable); - m.def("set_feed_variable_double", framework::SetFeedVariable); - m.def("set_feed_variable_int", framework::SetFeedVariable); + m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); BindProgramDesc(m); diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/framework/executor.py index 8da5daad993e9ceaff93b5271c30a3b260b7abcc..82b83d4bb6ac9d4c6a67d925db290c7c5e2d933f 100644 --- a/python/paddle/v2/framework/executor.py +++ b/python/paddle/v2/framework/executor.py @@ -1,6 +1,8 @@ import paddle.v2.framework.core as core from paddle.v2.framework.framework import Block, Program +g_scope = core.Scope() + class Executor(object): def __init__(self, places): @@ -20,10 +22,14 @@ class Executor(object): feed, fetch_list, feed_var_name='feed', - fetch_var_name='fetch'): + fetch_var_name='fetch', + scope=None): if not isinstance(program, Program): raise TypeError() + if scope is None: + scope = g_scope + program = program.clone() global_block = program.global_block() feed_var = global_block.create_var( @@ -38,8 +44,7 @@ class Executor(object): inputs={'X': [feed_var]}, outputs={'Out': [out]}, attrs={'col': i}) - # FIXME - core.set_feed_variable_float(feed[name], feed_var.name, i) + core.set_feed_variable(scope, feed[name], feed_var.name, i) fetch_var = global_block.create_var( name=fetch_var_name, @@ -52,8 +57,8 @@ class Executor(object): outputs={'Out': [fetch_var]}, attrs={'col': i}) - self.executor.run(program.desc, 0) + self.executor.run(program.desc, scope, 0) return [ - core.get_fetch_variable(fetch_var_name, i) + core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) ] diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 622e09fdde9de1f05d141780e9f2fb9fb6416acd..03a3dacf25c2ad5514e914d2f6e9637493ba80f4 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -15,7 +15,7 @@ class Variable(object): shape=None, dtype=None, lod_level=None, - persistable=False, + persistable=None, **kwargs): self.block = block @@ -343,6 +343,8 @@ class Block(object): def create_parameter(self, *args, **kwargs): global_block = self.program.global_block() param = Parameter(global_block, *args, **kwargs) + if 'init_attr' in kwargs: + self._prepend_initialize_ops_(param, kwargs['init_attr']) return param def append_op(self, *args, **kwargs): @@ -401,6 +403,17 @@ class Block(object): for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index] + def _prepend_initialize_ops_(self, param, init_attr): + op_type = init_attr['type'] + init_attr['shape'] = param.shape + init_attr['data_type'] = int(param.data_type) + op = self.prepend_op( + type=op_type, + inputs=None, + outputs={'Out': [param]}, + attrs=init_attr) + param.op = op + class Program(object): def __init__(self): @@ -475,27 +488,10 @@ class Parameter(Variable): Variable.__init__( self, block, persistable=True, shape=shape, dtype=dtype, **kwargs) self.trainable = kwargs.get('trainable', True) - self.init_attr = kwargs.get('initialize_attr', { - 'type': 'uniform_random', - 'min': -1.0, - 'max': 1.0 - }) self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0}) - self._append_initialize_ops_() - - def _append_initialize_ops_(self): - attr = self.init_attr - op_type = attr.pop('type', None) - block = self.block - assert isinstance(block, Block) - shape = self.shape - attr['dims'] = shape - attr['data_type'] = int(self.data_type) - op = block.prepend_op( - type=op_type, inputs=None, outputs={'Out': [self]}, attrs=attr) - self.op = op # program is a global instance. g_program = Program() +g_init_program = Program() diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py index 6615bdcd3b1afa493c9ad05c789664818e64d2f2..849a6f43065ae95e908e449e9ef9300b64692e5e 100644 --- a/python/paddle/v2/framework/layer_helper.py +++ b/python/paddle/v2/framework/layer_helper.py @@ -1,4 +1,4 @@ -from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program +from paddle.v2.framework.framework import Variable, OpProtoHolder, g_program, g_init_program import paddle.v2.framework.core as core import copy import itertools @@ -29,6 +29,14 @@ class LayerHelper(object): else: return prog + @property + def init_program(self): + prog = self.kwargs.get('init_program', None) + if prog is None: + return g_init_program + else: + return prog + def append_op(self, *args, **kwargs): return self.program.current_block().append_op(*args, **kwargs) @@ -66,16 +74,14 @@ class LayerHelper(object): actual = self.kwargs.get('param_attr', None) return actual if actual is not None else default - def bias_attr(self, shape, dtype): + def bias_attr(self): bias_attr = self.kwargs.get('bias_attr', None) if bias_attr is True: bias_attr = { 'name': None, 'init_attr': { 'type': 'fill_constant', - 'value': 0.0, - 'shape': shape, - 'dataType': dtype + 'value': 0.0 } } return bias_attr @@ -113,22 +119,27 @@ class LayerHelper(object): def create_parameter(self, attr, shape, dtype, suffix='w'): if attr['name'] is None: attr['name'] = unique_name(".".join([self.name, suffix])) - return self.program.global_block().create_parameter( + self.init_program.global_block().create_parameter( name=attr['name'], dtype=dtype, shape=shape, - initialize_attr=attr['init_attr']) + init_attr=attr['init_attr']) + return self.program.global_block().create_parameter( + name=attr['name'], dtype=dtype, shape=shape) def create_tmp_variable(self, dtype): return self.program.current_block().create_var( - name=unique_name(".".join([self.name, 'tmp'])), dtype=dtype) + name=unique_name(".".join([self.name, 'tmp'])), + dtype=dtype, + persistable=False) def create_global_variable(self, *args, **kwargs): - return self.program.global_block().create_var(*args, **kwargs) + return self.program.global_block().create_var( + *args, persistable=False, **kwargs) def append_bias_op(self, input_var): size = list(input_var.shape[1:]) - bias_attr = self.bias_attr(size, dtype=input_var.data_type) + bias_attr = self.bias_attr() if not bias_attr: return input_var diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py index 236427efcefafd8dc15f3f184f568887fdb00992..ac77aefa15333b06f9803ce1d91071df803483d1 100644 --- a/python/paddle/v2/framework/layers.py +++ b/python/paddle/v2/framework/layers.py @@ -13,7 +13,8 @@ def fc(input, name=None, act=None, num_flatten_dims=1, - program=None): + program=None, + init_program=None): # create helper helper = LayerHelper('fc', **locals()) @@ -59,7 +60,8 @@ def data(name, data_type='float32', type=core.VarDesc.VarType.LOD_TENSOR, append_batch_size=True, - program=None): + program=None, + init_program=None): helper = LayerHelper('data', **locals()) if append_batch_size: shape = [-1] + shape # append batch size as -1 @@ -160,7 +162,8 @@ def conv2d(input, padding=None, bias_attr=None, param_attr=None, - program=None): + program=None, + init_program=None): helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -207,7 +210,8 @@ def pool2d(input, pool_stride=[1, 1], pool_padding=[0, 0], global_pooling=False, - program=None): + program=None, + init_program=None): if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py index 381da55da3cd4e32fe09241a00d74e74e2de44f7..8a83ebfb9639f6fae6344b68509a80580881dab0 100644 --- a/python/paddle/v2/framework/nets.py +++ b/python/paddle/v2/framework/nets.py @@ -7,18 +7,21 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, - program=None): + program=None, + init_program=None): conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, act=act, - program=program) + program=program, + init_program=init_program) pool_out = layers.pool2d( input=conv_out, pool_size=pool_size, pool_type='max', pool_stride=pool_stride, - program=program) + program=program, + init_program=init_program) return pool_out diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py index e356a7aadb8d6a87d0fe54a5dd2a11fea0d80a74..f992a42c40a6e9e76fd7d0b7ecf9586f01fab645 100644 --- a/python/paddle/v2/framework/optimizer.py +++ b/python/paddle/v2/framework/optimizer.py @@ -1,32 +1,104 @@ import paddle.v2.framework.framework as framework +from collections import defaultdict -__all__ = ['SGDOptimizer'] +__all__ = ['SGDOptimizer', 'MomentumOptimizer'] class Optimizer(object): """Optimizer Base class. Define the common interface of an optimizer. - User should not use this class directly, but need to use one of it's implementation. + User should not use this class directly, + but need to use one of it's implementation. """ def __init__(self): - pass + # Dictionary of accumulators. Some optimizer subclasses need to + # allocate and manage extra variables associated with the parameters + # to train. These variables are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op """ raise NotImplementedError() - def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): + def _initialize_tensors(self, block): + """Create all necessary tensors, that will be shared for all parameter updates. + + Tensors like learning rate should be initialized here. + + Args: + block: the block in which the loss variable is present + """ + pass + + def _create_accumulators(self, block, parameters): + """Create all accumulators needed by the parameters + + Args: + block: the block in which the loss variable is present + parameters: list of parameter variables for the optimizer """ - create and add gradient Operators in BlockDesc to Compute gradients of `loss` - for parameters in parameter_list + pass + + def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0): + """Utility function to add an accumulator for a parameter + + Args: + block: the block in which the loss variable is present + name: name of the accumulator + param: parameter variable for which accumulator is to be added + dtype: data type of the accumulator variable + fill_value: value to initialize the accumulator variable + """ + if (name in self._accumulators and + param.name in self._accumulators[name]): + raise Exception("Accumulator {} already exists for parmeter {}". + format(name, param.name)) + global_block = block.program.global_block() + param_shape = list(param.shape) + param_acc = global_block.create_var( + dtype=dtype, shape=param_shape, lod_level=0) + + # Initialize the accumulator with fill_value + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + global_block.append_op( + type="fill_constant", + outputs={"Out": param_acc}, + attrs={"shape": param_shape, + "value": fill_value}) + + # Add to accumulators dict + self._accumulators[name][param.name] = param_acc + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + + Returns: + accumulator variable for the parameter + """ + if (name not in self._accumulators or + param.name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, param.name)) + return self._accumulators[name][param.name] + + def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): + """Create and add gradient Operators in BlockDesc to compute + gradients of `loss` for parameters in parameter_list Args: loss: an variable generated by cost function. no_grad_set: variable that should not create gradient - parameter_list: parameters that need to compute gradient and update to optimize the lost. + parameter_list: parameters that need to compute gradient and + update to optimize the lost. Returns: list of (parameters, gradients) pair. @@ -48,7 +120,8 @@ class Optimizer(object): if not grad_block.has_var(grad_info[0]): raise Exception("grad block[%d] did not have grad var %s" % grad_info[1], grad_info[0]) - param_var = loss.block.var(param) + # Get the param var from the global block + param_var = loss.block.program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) if loss.block.has_var(grad_info[0]): params_and_grads.append((param_var, grad_var)) @@ -64,14 +137,29 @@ class Optimizer(object): parameters_and_grads: a list of (variable, gradient) pair to update. Returns: - optmization_op_list: a list of optimization operator that will update parameter using gradient. + optmization_op_list: a list of optimization operator that will update + parameter using gradient. """ + # This is a default implementation of create_optimization_pass that + # can be shared by most optimizers. This implementation assumes that + # the subclass will implement the _append_optimize_op method and the + # _initialize_tensors method. The subclass can extend the + # _create_accumulators method if it needs to create accumulators + # for parameters. + + # Create any accumulators + self._create_accumulators(loss.block, + [p[0] for p in parameters_and_grads]) + # Create any necessary tensors + self._initialize_tensors(loss.block) + optimize_ops = [] for param_and_grad in parameters_and_grads: if param_and_grad[1] is not None: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) + return optimize_ops def minimize(self, loss, parameter_list=None, no_grad_set=None): @@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer): def __init__(self, learning_rate): assert learning_rate is not None - super(Optimizer, self).__init__() + super(SGDOptimizer, self).__init__() self.type = "sgd" self._learning_rate = learning_rate - def _append_optimize_op(self, block, param_and_grad): + def _initialize_tensors(self, block): assert isinstance(block, framework.Block) lr_shape = [1] - # create a var for learning_rate - lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0) + # create a variable for learning_rate + self._lr = block.create_var( + dtype="float32", shape=lr_shape, lod_level=0) # create an op to init the learning_rate - init_op = block.append_op( + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + block.append_op( type="fill_constant", - outputs={"Out": lr}, + outputs={"Out": self._lr}, attrs={"shape": lr_shape, "value": self._learning_rate}) + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + # create the optimize op sgd_op = block.append_op( type=self.type, inputs={ "Param": param_and_grad[0], "Grad": param_and_grad[1], - "LearningRate": lr + "LearningRate": self._lr }, - outputs={"ParamOut": param_and_grad[0]}, - attrs={"shape": [1], - "value": self._learning_rate}) + outputs={"ParamOut": param_and_grad[0]}) return sgd_op + + +class MomentumOptimizer(Optimizer): + """Simple Momentum optimizer with velocity state + """ + _velocity_acc_str = "velocity" + + def __init__(self, learning_rate, momentum): + assert learning_rate is not None + assert momentum is not None + super(MomentumOptimizer, self).__init__() + self.type = "momentum" + self._learning_rate = learning_rate + self._momentum = momentum + + def _initialize_tensors(self, block): + assert isinstance(block, framework.Block) + lr_shape = [1] + # create a variable for learning_rate + self._lr = block.create_var( + dtype="float32", shape=lr_shape, lod_level=0) + + # create an op to init the learning_rate + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + block.append_op( + type="fill_constant", + outputs={"Out": self._lr}, + attrs={"shape": lr_shape, + "value": self._learning_rate}) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(block, self._velocity_acc_str, p, 'float32') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + velocity_acc = self._get_accumulator(self._velocity_acc_str, + param_and_grad[0]) + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": self._lr + }, + outputs={ + "ParamOut": param_and_grad[0], + "VelocityOut": velocity_acc + }, + attrs={"mu": self._momentum}) + + return momentum_op diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/framework/tests/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..28433306d49112cc860f4ace9efca2b2d70deb3f --- /dev/null +++ b/python/paddle/v2/framework/tests/.gitignore @@ -0,0 +1 @@ +image/ diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 919b6c3f6745a9c6115e7af857c1a30354305f89..e1c45c2674ee9cc7c7240bdd67de05cb218ac287 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest): self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} - self.attrs = {"softLabel": False} + self.attrs = {"soft_label": False} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py index 2b01e43454e70c12b423db9925837cf336f79935..fa2ccd0c3b74a2ee8b8fd9eb8986cb79ff07c98e 100644 --- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py +++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py @@ -4,6 +4,12 @@ import unittest from paddle.v2.framework.op import Operator, DynamicRecurrentOp import numpy as np +# for siplicity, just one level LoD +lod_py = [[0, 4, 7, 9, 10]] +input_dim = 30 +num_sents = len(lod_py[0]) - 1 +weight_dim = 15 + def create_tensor(scope, name, shape, np_data): tensor = scope.var(name).get_tensor() @@ -12,6 +18,17 @@ def create_tensor(scope, name, shape, np_data): return tensor +class PyRNNStep(object): + def __init__(self): + + self.x = np.random.normal(size=(lod_py[0][-1], + input_dim)).astype("float32") + self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32") + self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32") + self.h_boot = np.random.normal(size=(num_sents, + input_dim)).astype("float32") + + class DynamicRecurrentOpTest(unittest.TestCase): ''' Test RNNOp @@ -23,17 +40,13 @@ class DynamicRecurrentOpTest(unittest.TestCase): - U vars: - x - memories: + states: - h outputs: - h ''' - # for siplicity, just one level LoD - lod_py = [[0, 4, 7, 9, 10]] - input_dim = 30 - num_sents = len(lod_py[0]) - 1 - weight_dim = 15 + py = PyRNNStep() def forward(self): self.scope = core.Scope() @@ -42,64 +55,55 @@ class DynamicRecurrentOpTest(unittest.TestCase): self.create_step_net() ctx = core.DeviceContext.create(core.CPUPlace()) self.rnnop.run(self.scope, ctx) - state = self.rnnop.get_state("h@mem") + state = self.rnnop.get_state("h@state") print 'state size: ', state.size() step_inputs = self.rnnop.get_step_input("x") print "x size ", step_inputs.size() for i in range(step_inputs.size()): print "x %d" % i, np.array(step_inputs.read(i).get_dims()) - step_outputs = self.rnnop.get_step_output('h@mem') + step_outputs = self.rnnop.get_step_output('h@state') print 'step_outputs.size ', step_outputs.size() - output = self.scope.find_var("h@mem").get_tensor() - + output = self.scope.find_var("h@state").get_tensor() print 'output', np.array(output).shape def create_global_variables(self): - x = np.random.normal(size=(self.lod_py[0][-1], - self.input_dim)).astype("float32") - W = np.random.normal(size=(self.input_dim, - self.input_dim)).astype("float32") - U = np.random.normal(size=(self.input_dim, - self.input_dim)).astype("float32") - h_boot = np.random.normal(size=(self.num_sents, - self.input_dim)).astype("float32") # create inlink - x_tensor = create_tensor(self.scope, "x", - [self.num_sents, self.input_dim], x) - x_tensor.set_lod(self.lod_py) - create_tensor(self.scope, "W", [self.input_dim, self.input_dim], W) - create_tensor(self.scope, "U", [self.input_dim, self.input_dim], U) - create_tensor(self.scope, "h_boot", [self.num_sents, self.input_dim], - h_boot) + x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], + self.py.x) + x_tensor.set_lod(lod_py) + create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) + create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) + create_tensor(self.scope, "h_boot", [num_sents, input_dim], + self.py.h_boot) self.scope.var("step_scopes") - self.scope.var("h@mem") + self.scope.var("h@state") def create_rnn_op(self): # create RNNOp self.rnnop = DynamicRecurrentOp( # inputs - inlinks=["x"], - boot_memories=["h_boot"], - step_net="stepnet", + inputs=["x"], + initial_states=["h_boot"], + step_net="step_unit", # outputs - outlinks=["h@mem"], + outputs=["h@state"], step_scopes="step_scopes", # attributes - pre_memories=["h@pre"], - memories=["h@mem"]) + ex_states=["h@pre"], + states=["h@state"]) def create_step_net(self): - stepnet = core.Net.create() + step_unit = core.Net.create() x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@mem") + sig_op = Operator("sigmoid", X="sum", Y="h@state") for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - stepnet.append_op(op) - stepnet.complete_add_op(True) - self.rnnop.set_stepnet(stepnet) + step_unit.append_op(op) + step_unit.complete_add_op(True) + self.rnnop.set_step_unit(step_unit) def test_forward(self): print 'test recurrent op forward' @@ -107,5 +111,58 @@ class DynamicRecurrentOpTest(unittest.TestCase): print 'pd_output', pd_output +class RecurrentGradientOpTest(unittest.TestCase): + py = PyRNNStep() + + def create_forward_op(self): + # create RNNOp + self.forward_op = DynamicRecurrentOp( + # inputs + inputs=["x"], + initial_states=["h_boot"], + step_net="step_unit", + # outputs + outputs=["h@state"], + step_scopes="step_scopes", + # attributes + ex_states=["h@pre"], + states=["h@state"]) + + def create_gradient_op(self): + a = set() + backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a) + + def create_step_net(self): + step_unit = core.Net.create() + x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") + h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") + sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") + sig_op = Operator("sigmoid", X="sum", Y="h@state") + + for op in [x_fc_op, h_fc_op, sum_op, sig_op]: + step_unit.append_op(op) + step_unit.complete_add_op(True) + self.forward_op.set_step_unit(step_unit) + + def create_global_variables(self): + # create inlink + x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], + self.py.x) + x_tensor.set_lod(lod_py) + create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) + create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) + create_tensor(self.scope, "h_boot", [num_sents, input_dim], + self.py.h_boot) + self.scope.var("step_scopes") + self.scope.var("h@state") + + def test_grad(self): + self.scope = core.Scope() + self.create_forward_op() + self.create_global_variables() + self.create_step_net() + self.create_gradient_op() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py index f3101a709b8bcf58e8682ab3d0ca5217a7f3572d..57daddd5698f77527bc5b78c436065a851867ae0 100644 --- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py +++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py @@ -92,5 +92,33 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseOp): } +class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(3, 4).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4) + } + + +class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_add" + self.inputs = { + 'X': np.random.rand(2, 1).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1) + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/framework/tests/test_feed_fetch_method.py index 47eedddcb6f47927ea3918d7f6c379c5710592c6..fbd659ece0188140e197982ea818d7c3897daf4e 100644 --- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py +++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py @@ -5,6 +5,7 @@ import numpy as np class TestFeedFetch(unittest.TestCase): def test_feed_fetch(self): + scope = core.Scope() place = core.CPUPlace() input_array = np.ones((4, 4, 6)).astype("float32") input_array[0, 0, 0] = 3 @@ -12,9 +13,9 @@ class TestFeedFetch(unittest.TestCase): input_tensor = core.LoDTensor([[0, 2, 4]]) input_tensor.set(input_array, place) - core.set_feed_variable_float(input_tensor, "feed", 0) + core.set_feed_variable(scope, input_tensor, "feed", 0) - output_tensor = core.get_fetch_variable("feed", 0) + output_tensor = core.get_fetch_variable(scope, "feed", 0) output_lod = output_tensor.lod() self.assertEqual(0, output_lod[0][0]) diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py new file mode 100644 index 0000000000000000000000000000000000000000..b20e3357894c2bacad83f0a99632710c586602de --- /dev/null +++ b/python/paddle/v2/framework/tests/test_fit_a_line.py @@ -0,0 +1,73 @@ +import paddle.v2 as paddle +import paddle.v2.framework.layers as layers +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + +init_program = Program() +program = Program() +x = layers.data( + name='x', + shape=[13], + data_type='float32', + program=program, + init_program=init_program) + +y_predict = layers.fc(input=x, + size=1, + act=None, + program=program, + init_program=init_program) + +y = layers.data( + name='y', + shape=[1], + data_type='float32', + program=program, + init_program=init_program) + +cost = layers.square_error_cost( + input=y_predict, label=y, program=program, init_program=init_program) +avg_cost = layers.mean(x=cost, program=program, init_program=init_program) + +sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +opts = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = core.CPUPlace() +exe = Executor(place) + +exe.run(init_program, feed={}, fetch_list=[]) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + for data in train_reader(): + x_data = np.array(map(lambda x: x[0], data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("float32") + + tensor_x = core.LoDTensor() + tensor_x.set(x_data, place) + # print tensor_x.get_dims() + + tensor_y = core.LoDTensor() + tensor_y.set(y_data, place) + # print tensor_y.get_dims() + outs = exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost]) + out = np.array(outs[0]) + + if out[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) diff --git a/python/paddle/v2/framework/tests/test_increment_op.py b/python/paddle/v2/framework/tests/test_increment_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e174272b05b9413cc2bc1e099c4dd17899829e76 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_increment_op.py @@ -0,0 +1,41 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestIncrementOpPositiveStep(OpTest): + """Test increment op with positive step + """ + + def setUp(self): + self.op_type = "increment" + self.inputs = {'X': np.random.random((10, 10)).astype("float32")} + self.attrs = {'step': 14.8} + self.outputs = {'Out': self.inputs['X'] + self.attrs['step']} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestIncrementOpNegativeStep(OpTest): + """Test increment op with negative step + """ + + def setUp(self): + self.op_type = "increment" + self.inputs = {'X': np.random.random((10, 10)).astype("float32")} + self.attrs = {'step': -3.8} + self.outputs = {'Out': self.inputs['X'] + self.attrs['step']} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/framework/tests/test_momentum_op.py index d3353ff6e4f4da32eaefdd4e816a621ddac8bece..654d31975aab4578055e7e70ade202bd2c3d93cb 100644 --- a/python/paddle/v2/framework/tests/test_momentum_op.py +++ b/python/paddle/v2/framework/tests/test_momentum_op.py @@ -3,7 +3,7 @@ import numpy as np from op_test import OpTest -class TestMomentumOp(OpTest): +class TestMomentumOp1(OpTest): def setUp(self): self.op_type = "momentum" @@ -12,6 +12,7 @@ class TestMomentumOp(OpTest): velocity = np.zeros((123, 321)).astype("float32") learning_rate = np.array([0.001]).astype("float32") mu = 0.0001 + use_nesterov = False self.inputs = { 'Param': param, @@ -23,7 +24,47 @@ class TestMomentumOp(OpTest): self.attrs = {'mu': mu} velocity_out = mu * velocity + grad - param_out = param - learning_rate * velocity_out + if use_nesterov: + param_out = param - grad * learning_rate + \ + velocity_out * mu * learning_rate + else: + param_out = param - learning_rate * velocity_out + + self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + + def test_check_output(self): + self.check_output() + + +class TestMomentumOp2(OpTest): + '''Test Momentum with defaukt values for attributes + ''' + + def setUp(self): + self.op_type = "momentum" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + mu = 0.0001 + use_nesterov = True + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate + } + + self.attrs = {'mu': mu, 'useNesterov': use_nesterov} + + velocity_out = mu * velocity + grad + if use_nesterov: + param_out = param - grad * learning_rate + \ + velocity_out * mu * learning_rate + else: + param_out = param - learning_rate * velocity_out self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py index 3d6fa70737bf360df53785dc602feceda471ee70..e6a142ac361b572c8df42dbb5cd1b116584ed324 100644 --- a/python/paddle/v2/framework/tests/test_optimizer.py +++ b/python/paddle/v2/framework/tests/test_optimizer.py @@ -6,7 +6,7 @@ import paddle.v2.framework.optimizer as optimizer class TestOptimizer(unittest.TestCase): def test_sgd_optimizer(self): - program = framework.g_program + program = framework.Program() block = program.global_block() mul_x = block.create_parameter( dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") @@ -14,7 +14,7 @@ class TestOptimizer(unittest.TestCase): dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - mul_op = block.append_op( + block.append_op( type="mul", inputs={"X": mul_x, "Y": mul_y}, @@ -27,5 +27,47 @@ class TestOptimizer(unittest.TestCase): self.assertEqual(sgd_op.type, "sgd") +class TestMomentumOptimizer(unittest.TestCase): + class MockMomentum(optimizer.MomentumOptimizer): + def get_accumulators(self): + return self._accumulators + + def get_velocity_str(self): + return self._velocity_acc_str + + def test_momentum_optimizer(self): + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2) + params_grads = momentum_optimizer.create_backward_pass(mul_out) + self.assertEqual(len(params_grads), 1) + self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) + opts = momentum_optimizer.create_optimization_pass(params_grads, + mul_out) + self.assertEqual(len(opts), 1) + sgd_op = opts[0] + self.assertEqual(sgd_op.type, "momentum") + + # Check accumulators + accumulators = momentum_optimizer.get_accumulators() + self.assertEqual(len(accumulators), 1) + self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators) + velocity_acc = accumulators[momentum_optimizer.get_velocity_str()] + self.assertEqual(len(velocity_acc), 1) + self.assertTrue(mul_x.name in velocity_acc) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..2b305213df424dd097bf4238aa14320a2f7da45d --- /dev/null +++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py @@ -0,0 +1,92 @@ +import paddle.v2 as paddle +import paddle.v2.framework.layers as layers +import paddle.v2.framework.nets as nets +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + +init_program = Program() +program = Program() + +images = layers.data( + name='pixel', + shape=[1, 28, 28], + data_type='float32', + program=program, + init_program=init_program) +label = layers.data( + name='label', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) +conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + program=program, + init_program=init_program) +conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + program=program, + init_program=init_program) + +predict = layers.fc(input=conv_pool_2, + size=10, + act="softmax", + program=program, + init_program=init_program) +cost = layers.cross_entropy( + input=predict, label=label, program=program, init_program=init_program) +avg_cost = layers.mean(x=cost, program=program) + +sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +opts = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 50 +PASS_NUM = 1 +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = core.CPUPlace() +exe = Executor(place) + +exe.run(init_program, feed={}, fetch_list=[]) + +for pass_id in range(PASS_NUM): + count = 0 + for data in train_reader(): + img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int32") + y_data = y_data.reshape([BATCH_SIZE, 1]) + + tensor_img = core.LoDTensor() + tensor_y = core.LoDTensor() + tensor_img.set(img_data, place) + tensor_y.set(y_data, place) + + outs = exe.run(program, + feed={"pixel": tensor_img, + "label": tensor_y}, + fetch_list=[avg_cost]) + + loss = np.array(outs[0]) + + if loss < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..a985d1f3d38fcaa8372a70edd519b873d47f554a --- /dev/null +++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py @@ -0,0 +1,83 @@ +import paddle.v2 as paddle +import paddle.v2.framework.layers as layers +import paddle.v2.framework.core as core +import paddle.v2.framework.optimizer as optimizer + +from paddle.v2.framework.framework import Program, g_program +from paddle.v2.framework.executor import Executor + +import numpy as np + +init_program = Program() +program = Program() +image = layers.data( + name='x', + shape=[784], + data_type='float32', + program=program, + init_program=init_program) + +hidden1 = layers.fc(input=image, + size=128, + act='relu', + program=program, + init_program=init_program) +hidden2 = layers.fc(input=hidden1, + size=64, + act='relu', + program=program, + init_program=init_program) + +predict = layers.fc(input=hidden2, + size=10, + act='softmax', + program=program, + init_program=init_program) + +label = layers.data( + name='y', + shape=[1], + data_type='int32', + program=program, + init_program=init_program) + +cost = layers.cross_entropy( + input=predict, label=label, program=program, init_program=init_program) +avg_cost = layers.mean(x=cost, program=program, init_program=init_program) + +sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) +opts = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 128 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=BATCH_SIZE) + +place = core.CPUPlace() +exe = Executor(place) + +exe.run(init_program, feed={}, fetch_list=[]) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + for data in train_reader(): + x_data = np.array(map(lambda x: x[0], data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int32") + y_data = np.expand_dims(y_data, axis=1) + + tensor_x = core.LoDTensor() + tensor_x.set(x_data, place) + + tensor_y = core.LoDTensor() + tensor_y.set(y_data, place) + + outs = exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost]) + out = np.array(outs[0]) + if out[0] < 5.0: + exit(0) # if avg cost less than 5.0, we think our code is good. +exit(1) diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py index 191ce0b0c8d5fb6c4d8037a6c1bfda57c394489e..cc4008c0d8e73a3f7d9a9be2a4aacfd120ecd522 100644 --- a/python/paddle/v2/framework/tests/test_recurrent_op.py +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -132,15 +132,15 @@ class RecurrentOpTest(unittest.TestCase): # create RNNOp self.rnnop = RecurrentOp( # inputs - inlinks=["x"], - boot_memories=["h_boot"], + inputs=["x"], + initial_states=["h_boot"], step_net="stepnet", # outputs - outlinks=["h@mem"], + outputs=["h@mem"], step_scopes="step_scopes", # attributes - pre_memories=["h@pre"], - memories=["h@mem"]) + ex_states=["h@pre"], + states=["h@mem"]) def create_step_net(self): stepnet = core.Net.create() @@ -169,15 +169,15 @@ class RecurrentGradientOpTest(unittest.TestCase): def create_forward_op(self): self.forward_op = RecurrentOp( # inputs - inlinks=["x"], - boot_memories=["h_boot"], + inputs=["x"], + initial_states=["h_boot"], step_net="stepnet", # outputs - outlinks=["h"], + outputs=["h"], step_scopes="step_scopes", # attributes - pre_memories=["h@pre"], - memories=["h@alias"]) + ex_states=["h@pre"], + states=["h@alias"]) # create a stepnet for RNN stepnet = core.Net.create() diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/framework/tests/test_rmsprop_op.py index 3e5ff733e9b55fe8c9727e9721e25083a494be15..237bcfccceee89f62fc05e4c6c972a76d1875367 100644 --- a/python/paddle/v2/framework/tests/test_rmsprop_op.py +++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py @@ -46,7 +46,7 @@ class TestRmspropOp1(OpTest): class TestRmspropOp2(OpTest): - '''Test RMSProp with defaukt values for attributes + '''Test RMSProp with default values for attributes ''' def setUp(self): diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py index a2d28a65a67b03a6c74348b19ba99cffc55738e9..ded777105e0fc64eb82bf4013bfba7ba9d0ddefa 100644 --- a/python/paddle/v2/framework/tests/test_uniform_random_op.py +++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py @@ -19,7 +19,7 @@ class TestUniformRandomOp(unittest.TestCase): op = Operator( "uniform_random", Out='X', - dims=[1000, 784], + shape=[1000, 784], min=-5.0, max=10.0, seed=10)