diff --git a/.gitignore b/.gitignore index ac56a3320ec85769d2c87c072512f5217eca0c24..59e650bdfe801c7e2ff19b6c0a9d60bed1e1ee10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +paddle/operators/check_t.save +paddle/operators/check_tensor.ls +paddle/operators/tensor.save +python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ +python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ +python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ *.DS_Store build/ build_doc/ diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 6bea7cf3022242ce48cc882915f7e71810937283..de94bd5008effef1bf0fd3a125d4aed56e1b7f81 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 33ef6860e1d38f4e87c4431addf43f9f8a655fc2..1cb54ba2164fafbfce9f28a3e894ae5e78a9cd68 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -179,20 +179,24 @@ function(cc_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (cc_library_SRCS) - if (cc_library_SHARED OR cc_library_shared) # build *.so + if(cc_library_SRCS) + if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) endif() - if (cc_library_DEPS) + if(cc_library_DEPS) # Don't need link libwarpctc.so - if ("${cc_library_DEPS};" MATCHES "warpctc;") + if("${cc_library_DEPS};" MATCHES "warpctc;") list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) + target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") + list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) + endif() add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() # cpplint code style diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index e24613b94b422b7cdf9c6383c359fa92a4faf6ff..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -323,6 +323,12 @@ batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: +layer_norm +---------- + +.. autofunction:: paddle.v2.fluid.layers.layer_norm + :noindex: + beam_search_decode ------------------ diff --git a/doc/howto/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst index 9ecab5594cff47cde4700b7ce0f58013a960a16e..bcc8c2f46eb662ec3650e829a77992224dbbb8e7 100644 --- a/doc/howto/rnn/index_cn.rst +++ b/doc/howto/rnn/index_cn.rst @@ -1,4 +1,4 @@ -RNN相关模型 +RNN模型 =========== .. 
toctree:: diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 63a78428583477792e309a3b3d26af340caccfca..0f645db6fc5d0f84bbe0cbb335677752e3a355ea 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,5 +8,4 @@ PaddlePaddle 文档 build_and_install/index_cn.rst howto/index_cn.rst dev/index_cn.rst - api/index_cn.rst faq/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 5631381be087017c26b2a6a3984b3c5bdb49f12d..166f56c28f464563a0b36007f58cebb58c286916 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -8,4 +8,3 @@ PaddlePaddle Documentation build_and_install/index_en.rst howto/index_en.rst dev/index_en.rst - api/index_en.rst diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index dd2ed87252102aee6d384f37365d19305f19b281..3e344ea3790f57b0f53f36a40263dcdd326e67a9 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDesc(*op, this)); + ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); } - for (auto &it : other.vars_) { auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h index b679387b1124e42499df158767b6c7afe1afd0c6..146f0e9e71ea9101a8f6c71e6c023178f131f967 100644 --- a/paddle/framework/channel.h +++ b/paddle/framework/channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc index a307abb4ed37880bb289a8373adf0d293382c97e..d7140dd10661c7b8582930b47872ab0b330c4d66 100644 --- a/paddle/framework/channel_test.cc +++ b/paddle/framework/channel_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,26 @@ using paddle::framework::CloseChannel; using paddle::framework::details::Buffered; using paddle::framework::details::UnBuffered; +void RecevingOrderEqualToSendingOrder(Channel *ch) { + unsigned sum_send = 0; + std::thread t([&]() { + for (int i = 0; i < 5; i++) { + EXPECT_EQ(ch->Send(&i), true); + sum_send += i; + } + }); + for (int i = 0; i < 5; i++) { + int recv; + EXPECT_EQ(ch->Receive(&recv), true); + EXPECT_EQ(recv, i); + } + + CloseChannel(ch); + t.join(); + EXPECT_EQ(sum_send, 10U); + delete ch; +} + TEST(Channel, MakeAndClose) { using paddle::framework::details::Buffered; using paddle::framework::details::UnBuffered; @@ -137,9 +157,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { for (size_t i = 0; i < buffer_size; ++i) { EXPECT_EQ(ch->Receive(&out), - false); // after receiving residual values, return zeros. - // Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't - // define zero values like Go does. 
+ false); // receiving on closed channel should return false } delete ch; } @@ -158,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { sum += i; } }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec EXPECT_EQ(sum, 45U); CloseChannel(ch); @@ -166,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { delete ch; } -TEST(Channel, SimpleUnbufferedChannelTest) { +TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { auto ch = MakeChannel(0); - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); - sum_send += i; - } - }); - for (int i = 0; i < 5; i++) { - int recv; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } + RecevingOrderEqualToSendingOrder(ch); +} - CloseChannel(ch); - t.join(); - EXPECT_EQ(sum_send, 10U); - delete ch; +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch); } -// This tests that closing a buffered channel also unblocks -// any receivers waiting on the channel -TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(1); +void ChannelCloseUnblocksReceiversTest(Channel *ch) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -201,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { t[i] = std::thread( [&](bool *p) { int data; - // All reads should return false EXPECT_EQ(ch->Receive(&data), false); *p = true; }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec - // Verify that all threads are blocked + // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], false); } @@ -218,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { // This should unblock all receivers CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -226,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { } for (size_t i = 0; i < num_threads; i++) t[i].join(); - delete ch; } -// This tests that closing a buffered channel also unblocks -// any senders waiting for channel to have write space -TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(1); +void ChannelCloseUnblocksSendersTest(Channel *ch) { + using paddle::framework::details::Buffered; + using paddle::framework::details::UnBuffered; + size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -252,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { } std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - // Verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < num_threads; i++) { - if (thread_ended[i] == false) ct++; + if (dynamic_cast *>(ch)) { + // If ch is Buffered, atleast 4 threads must be blocked. + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (!thread_ended[i]) ct++; + } + EXPECT_GE(ct, 4); + } else { + // If ch is UnBuffered, all the threads should be blocked. 
+ for (size_t i = 0; i < num_threads; i++) { + EXPECT_EQ(thread_ended[i], false); + } } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - // Explicitly close the thread // This should unblock all senders CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], true); } - // Verify that only 1 send was successful - ct = 0; - for (size_t i = 0; i < num_threads; i++) { - if (send_success[i]) ct++; + if (dynamic_cast *>(ch)) { + // Verify that only 1 send was successful + int ct = 0; + for (size_t i = 0; i < num_threads; i++) { + if (send_success[i]) ct++; + } + // Only 1 send must be successful + EXPECT_EQ(ct, 1); } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); for (size_t i = 0; i < num_threads; i++) t[i].join(); +} + +// This tests that closing a buffered channel also unblocks +// any receivers waiting on the channel +TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksReceiversTest(ch); + delete ch; +} + +// This tests that closing a buffered channel also unblocks +// any senders waiting for channel to have write space +TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { + auto ch = MakeChannel(1); + ChannelCloseUnblocksSendersTest(ch); delete ch; } @@ -287,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { // unblocks any receivers waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { auto ch = MakeChannel(0); - size_t num_threads = 5; - std::thread t[num_threads]; - bool thread_ended[num_threads]; - - // Launches threads that try to read and are blocked becausew of no writers - for (size_t i = 0; i < num_threads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the thread - // This should unblock all receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < num_threads; i++) t[i].join(); + ChannelCloseUnblocksReceiversTest(ch); delete ch; } @@ -328,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { // unblocks any senders waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(0); - size_t num_threads = 5; - std::thread t[num_threads]; - bool thread_ended[num_threads]; - - // Launches threads that try to read and are blocked becausew of no writers - for (size_t i = 0; i < num_threads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data = 10; - EXPECT_EQ(ch->Send(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the thread - // This should unblock all 
receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < num_threads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < num_threads; i++) t[i].join(); + ChannelCloseUnblocksReceiversTest(ch); delete ch; } diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h index 77eebc9924954b8adbbadceb4ede57f0a21f05aa..227a4e4811f95441158150396b5b882815fd7844 100644 --- a/paddle/framework/details/buffered_channel.h +++ b/paddle/framework/details/buffered_channel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,14 @@ namespace paddle { namespace framework { namespace details { +// Four of the properties of Buffered Channel: +// - A send to a full channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from an empty channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. + template class Buffered : public paddle::framework::Channel { friend Channel* paddle::framework::MakeChannel(size_t); diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h index 7e308ffb5a49876aa2c1833b3b7e2a2c7eb137aa..69bcea625288eba897e761a1d634f19c41dc0f79 100644 --- a/paddle/framework/details/cow_ptr.h +++ b/paddle/framework/details/cow_ptr.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc index 936954a2333e7e5d2a932abad641279db9ef7b9f..1f4a12bca0dcab2d146cc62cd7ce1c2d7abcddf9 100644 --- a/paddle/framework/details/cow_ptr_test.cc +++ b/paddle/framework/details/cow_ptr_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index 6d50e820b2b625f932768d2ca671d999071f1ca6..31a40bcbcb3905f01aebefe89526f3cfba8cb8c7 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h index 92a16b4d22bbb6a8c75157444aa8474f700603fe..6b5c2196cb2991051c48f7da8397d2f479ca4c58 100644 --- a/paddle/framework/details/unbuffered_channel.h +++ b/paddle/framework/details/unbuffered_channel.h @@ -23,6 +23,13 @@ namespace paddle { namespace framework { namespace details { +// Four of the properties of UnBuffered Channel: +// - A send to a channel blocks temporarily until a receive from the +// channel or the channel is closed. +// - A receive from a channel blocks temporarily until a send to the +// channel or the channel is closed. +// - A send to a closed channel returns false immediately. +// - A receive from a closed channel returns false immediately. template class UnBuffered : public paddle::framework::Channel { friend Channel* paddle::framework::MakeChannel(size_t); diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 2082f8bb76fb62bc36f033fecbd4eaa76d12d949..f51753453bee255a0592e9c5b0fb2d9aa380e109 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -122,6 +122,11 @@ class GradOpDescMakerBase { return it->second; } + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + std::string ForwardOpType() const { return this->fwd_op_.Type(); } private: diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index be2b301619639106ac7b578e5a79cf33f4379e48..9de454428d9fd733aa70601f5012e77b9ceb2022 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -46,29 +46,7 @@ namespace framework { * 0 2 4 7 * 0 2 5 7 10 12 15 20 */ -struct LoD : public std::vector> { - using std::vector>::vector; - platform::Place place() const { - if (this->size() == 0) { - // Not Initialze Yet. 
- return platform::CPUPlace(); - } else { - return this->front().place(); - } - } - - void CopyFromCUDA() { - for (auto it = this->begin(); it != this->end(); ++it) { - it->CopyFromCUDA(); - } - } - - void CopyToPeer(platform::Place place) { - for (auto it = this->begin(); it != this->end(); ++it) { - it->CopyToPeer(place); - } - } -}; +using LoD = std::vector>; std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoDTensor& t); diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index adea02e3b3fdcf4873de76ff91116f43ac9fe259..a28b7caf86c689d55808c4e7defecd37a5a03442 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -20,6 +20,7 @@ #include "paddle/platform/assert.h" #include +#include __global__ void test(size_t* a, int size) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; @@ -36,10 +37,9 @@ TEST(LoD, data) { lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); auto& v = lod[0]; - test<<<1, 1>>>(v.cuda_data(), v.size()); + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); - - v.CopyFromCUDA(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -63,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) { auto lod = lod_tensor.lod(); - test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size()); + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); - lod.CopyFromCUDA(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h index 5202775515d335ff81bb17e6ce21338c40041ca3..f776f0317a2bed69cf8795c2a12a467c50ba38d3 100644 --- a/paddle/framework/mixed_vector.h +++ b/paddle/framework/mixed_vector.h @@ -17,176 +17,347 @@ #include #include -#include "paddle/memory/memcpy.h" -#include "paddle/memory/memory.h" -#include "paddle/platform/device_context.h" -#include "paddle/platform/enforce.h" -#include "paddle/platform/place.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" + +#include "glog/logging.h" namespace paddle { namespace framework { -/** - * @brief Vector support both cpu and gpu. - * host vector lifetime is same with Vector - * device vector is lazily malloc and modified. - */ - +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. template -class Vector : public std::vector { +class Vector { public: - using std::vector::vector; + using value_type = T; + + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + if (count == 0) { + InitEmpty(); + } else { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. 
+ template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return &this->operator[](0); } + + T* end() { return &this->operator[](size()); } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { return &this->operator[](0); } + const T* end() const { return &this->operator[](size()); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } - Vector() {} - Vector(const std::vector &v) : std::vector(v) {} // NOLINT + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } - inline platform::Place place() const { return place_; } + // resize the vector + void resize(size_t size) { + if (size + 1 < capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + platform::Place cpu = platform::CPUPlace(); + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } - /*! Return a pointer to constant memory block. */ - inline const T *data(platform::Place place) const; + // get cuda ptr. immutable + const T* CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return cuda_vec_.data(); + } - /*! Return a pointer to mutable memory block. */ - inline T *mutable_data(platform::Place place); + // get cuda ptr. 
mutable + T* CUDAMutableData(platform::Place place) { + const T* ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } - // TODO(dzhwinter): below interfaces should be removed - /* Get device vector */ - T *cuda_data() { - CopyToCUDA(); - PADDLE_ENFORCE_NOT_NULL( - cuda_ptr_, "No data or Insufficient CUDA memory to allocation"); - return static_cast(cuda_ptr_.get()); + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; } - /* Get host vector */ - T *data() { return std::vector::data(); } - const T *data() const { return std::vector::data(); } + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } - T *data(const platform::Place &place) { - if (platform::is_cpu_place(place)) { + // the unify method to access CPU or CUDA data. immutable. + const T* Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T* MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); } else { - return cuda_data(); + return data(); } } - /* Synchronize host vector to device vector */ - void CopyToCUDA(); - /* Synchronize device vector to host vector */ - void CopyFromCUDA(); - /* Switch device vector location */ - void CopyToPeer(platform::Place); + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } private: - std::shared_ptr cuda_ptr_; - size_t cuda_size_ = 0; // device vector numel - platform::CUDAPlace place_; -}; + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } -template -inline const T *Vector::data(platform::Place place) const { - if (platform::is_cpu_place(place)) { - return std::vector::data(); - } else if (platform::is_gpu_place(place)) { - if (cuda_ptr_ == nullptr) { - return nullptr; - } - if (boost::get(place) == place_) { - return static_cast(cuda_ptr_.get()); + template + void InitByIter(size_t size, Iter begin, Iter end) { + platform::Place cpu = platform::CPUPlace(); + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)}), cpu); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. 
+ kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); + WaitPlace(cuda_vec_.place()); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == cuda_vec_.place())) { + framework::Tensor tmp; + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + cuda_vec_.ShareDataWith(tmp); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } } else { - PADDLE_THROW( - "Unmatched place. Please use `mutable_data` copy lod to the target " - "Place first."); + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + Copy(cpu_vec_, boost::get(place), &cuda_vec_); + WaitPlace(place); + SetFlag(kDataInCUDA); + } else if (!(place == cuda_vec_.place())) { + framework::Tensor tmp; + WaitPlace(cuda_vec_.place()); + Copy(cuda_vec_, boost::get(place), &tmp); + WaitPlace(cuda_vec_.place()); + WaitPlace(place); + cuda_vec_.ShareDataWith(tmp); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } } - } else { - PADDLE_THROW("Unsupport Place."); } -} -template -inline T *Vector::mutable_data(platform::Place place) { - if (platform::is_cpu_place(place)) { - return std::vector::data(); - } else if (platform::is_gpu_place(place)) { - if (boost::get(place) != place_) { - place_ = boost::get(place); - } -#ifdef PADDLE_WITH_CUDA - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); - return static_cast(cuda_ptr_.get()); -#else - return nullptr; -#endif - } else { - PADDLE_THROW("Unsupport Place."); - } -} + void ImmutableCPU() const { + if (IsDirty() && + !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. 
+ CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } -template -void Vector::CopyToCUDA() { -#ifdef PADDLE_WITH_CUDA - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } -template -void Vector::CopyFromCUDA() { -#ifdef PADDLE_WITH_CUDA - if (cuda_ptr_ == nullptr) { - LOG(WARNING) << "No uncommitted cuda data."; - return; - } - this->resize(cuda_size_); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(platform::CPUPlace(), static_cast(this->data()), place_, - static_cast(cuda_ptr_.get()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + bool IsDirty() const { return flag_ & kDirty; } -template -void Vector::CopyToPeer(platform::Place place) { -#ifdef PADDLE_WITH_CUDA - if (boost::get(place) != place_) { - place_ = boost::get(place); - } - if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { - cuda_ptr_.reset( - memory::Alloc(place_, this->size() * sizeof(T)), - memory::PlainDeleter(place_)); - } - cuda_size_ = this->size(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *ctx = pool.GetByPlace(place_); - memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), - static_cast(this->data()), - this->size() * sizeof(T), ctx->stream()); - ctx->Wait(); -#endif -} + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + static void WaitPlace(const platform::Place place) { + if (platform::is_gpu_place(place)) { + platform::DeviceContextPool::Instance() + .Get(boost::get(place)) + ->Wait(); + } + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; } // namespace framework } // namespace paddle diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu index 7b571788ad1ade50e05dc9a70cba35b83f8db3ea..f02db8f612c498ddd103a6b96e80d265e13c0f52 100644 --- a/paddle/framework/mixed_vector_test.cu +++ b/paddle/framework/mixed_vector_test.cu @@ -11,62 +11,83 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include -#include "gtest/gtest.h" -#include "paddle/framework/init.h" +#include "glog/logging.h" +#include "gtest/gtest.h" #include "paddle/framework/mixed_vector.h" - -using namespace paddle::framework; -using namespace paddle::platform; -using namespace paddle::memory; +#include "paddle/platform/gpu_info.h" template -__global__ void test(T* data, int size) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; - i += blockDim.x * gridDim.x) { - data[i] *= 2; +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; } } -TEST(Vector, Normal) { - // fill the device context pool. - InitDevices(); +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} - Vector vec({1, 2, 3}); - size_t* ptr = vec.data(); - for (size_t i = 0; i < vec.size(); ++i) { - EXPECT_EQ(vec[i], *(ptr + i)); +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu(0); - vec.clear(); - vec.CopyFromCUDA(); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); - std::vector v = {1, 2, 3}; - for (size_t i = 0; i < v.size(); ++i) { - EXPECT_EQ(v[i], vec[i]); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); } } -TEST(Vector, MultipleCopy) { - InitDevices(); - Vector vec({1, 2, 3}); - CUDAPlace place(0); - vec.mutable_data(place); - auto vec2 = Vector(vec); - { - const size_t* ptr = vec2.data(CPUPlace()); - for (size_t i = 0; i < vec2.size(); ++i) { - EXPECT_EQ(*(ptr + i), vec[i]); - } +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); } - test<<<3, 3>>>(vec2.mutable_data(place), vec2.size()); - vec2.CopyFromCUDA(); - { - const size_t* ptr = vec2.data(CPUPlace()); - for (size_t i = 0; i < vec2.size(); ++i) { - EXPECT_EQ(*(ptr + i), vec[i] * 2); - } + ASSERT_EQ(tmp.size(), 10); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); } } diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index ea4028750248ec47f5094a67f736fb217216af6d..b51afe499bbc0e6b727aeeb4334f56e400ea81a5 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -125,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) // restore attrs_ for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting 
BLOCK attr here. if (attr.type() != proto::AttrType::BLOCK) { attrs_[attr_name] = GetAttrValue(attr); - } else { - auto bid = attr.block_idx(); - attrs_[attr_name] = prog->MutableBlock(bid); } } this->block_ = block; diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 5de9ae559c435439f30931c7840e54e0d2bb744c..6fb8532b2a807e287de7fab03402f8a290eabcf2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register Operator. When the input is duplicable, you should - * use REGISTER_OP_EX with deop_empty_grad=false instead. + * use REGISTER_OP_EX with drop_empty_grad=false instead. */ #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ grad_op_class) \ diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index 15ea4035c6e6193105b621210a900e74d1466941..0e937dda4e185590648962a6d4f827eea21eb620 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; - for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } } ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { @@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx)); + } + } + } + } } ProgramDesc::ProgramDesc(const std::string &binary_str) { diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc index bff8e0bceaca9749101b2c45edddba526d565624..ddd6b993d40f72cba919fad95318f70409c98bca 100644 --- a/paddle/framework/prune.cc +++ b/paddle/framework/prune.cc @@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) { return false; } -void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, - int block_id) { - // TODO(tonyyang-svail): - // - will change to use multiple blocks for RNN op and Cond Op +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set& dependent_vars) { auto& block = input.blocks(block_id); auto& ops = block.ops(); @@ -72,11 
+89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, expect_fetch = (op_desc.type() == kFetchOpType); } - std::set dependent_vars; std::vector should_run; for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; - if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { // insert its input to the dependency graph for (auto& var : op_desc.inputs()) { @@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, dependent_vars.insert(argu); } } - should_run.push_back(true); } else { should_run.push_back(false); @@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, // we reverse the should_run vector std::reverse(should_run.begin(), should_run.end()); - *output = input; - auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); op_field->Clear(); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { - *op_field->Add() = input.blocks(block_id).ops(i); + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + sub_block_dependent_vars); + } } } // remove the VarDescs in BlockDesc that are not referenced in // the pruned OpDescs std::unordered_map var_map; - auto* var_field = output->mutable_blocks(block_id)->mutable_vars(); + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); for (const auto& var : *var_field) { var_map[var.name()] = var; } - var_field->Clear(); + std::set var_names; for (const auto& op : *op_field) { - // add VarDescs of all input arguments for each OpDesc auto& input_field = op.inputs(); for (auto& input_var : input_field) { for (auto& arg : input_var.arguments()) { - *var_field->Add() = var_map[arg]; + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } } } - // add VarDescs of all output arguments for each OpDesc auto& output_field = op.outputs(); for (auto& output_var : output_field) { for (auto& arg : output_var.arguments()) { - *var_field->Add() = var_map[arg]; + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } } } } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } } // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { - prune_impl(input, output, 0); + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, dependent_vars); } void inference_optimize_impl(const proto::ProgramDesc& 
input, diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index f0ea709a5c37e769e3ffa1b2e9d1e39721979251..be09b7c94507b99b5b4cbfe6f2039c74ec76b4a2 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -120,6 +120,7 @@ class Tensor { return holder_->type(); } + // memory size returns the holding memory size in byte. size_t memory_size() const; inline void check_memory_size() const; diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 1340c5e48520ccdd537e694abf452fd79129df99..652d6b8a90ebb8e761aa7ab01902802d93bfecae 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -52,7 +52,7 @@ struct SizeOfTypeFunctor { }; static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor functor; + SizeOfTypeFunctor functor; size_t size = functor(type); PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); return size; @@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) { inline void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE_GE( - holder_->size(), memory_size() + offset_, + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), "Tensor's dims_ is out of bound. Call Tensor::mutable_data " "first to re-allocate memory.\n" "or maybe the required data-type mismatches the data already stored."); } inline size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : numel() * SizeOfType(type()); + return holder_ == nullptr ? 0UL : holder_->size() - offset_; } template diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt index 654a6119bdc85f43b0cae631a9dc8f0ccd758889..bdb147955ca0700dc0854b54c38d961caf8845f3 100644 --- a/paddle/inference/CMakeLists.txt +++ b/paddle/inference/CMakeLists.txt @@ -4,19 +4,14 @@ cc_library(paddle_fluid_api SRCS io.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -# Merge all modules into a single static library +# Create static library cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) # Create shared library -add_library(paddle_fluid_shared SHARED io.cc) - -target_circle_link_libraries(paddle_fluid_shared - ARCHIVE_START - ${GLOB_OP_LIB} - ${FLUID_CORE_MODULES} - ARCHIVE_END) - -SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) +cc_library(paddle_fluid_shared SHARED + SRCS io.cc + DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) +set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(WITH_TESTING) add_subdirectory(tests/book) diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc index 1ed14b69c83a7a0fb5a55db9c179df133407440c..784e87970f77857e7f3182df904dc0133c44d6c9 100644 --- a/paddle/inference/io.cc +++ b/paddle/inference/io.cc @@ -21,6 +21,17 @@ limitations under the License. 
*/ namespace paddle { namespace inference { +void ReadBinaryFile(const std::string& filename, std::string& contents) { + VLOG(3) << "loading model from " << filename; + std::ifstream inputfs(filename, std::ios::in | std::ios::binary); + inputfs.seekg(0, std::ios::end); + contents.clear(); + contents.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + inputfs.read(&contents[0], contents.size()); + inputfs.close(); +} + bool IsParameter(const framework::VarDesc* var, const framework::ProgramDesc& main_program) { if (var->Persistable()) { @@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var, void LoadPersistables(framework::Executor& executor, framework::Scope& scope, + const framework::ProgramDesc& main_program, const std::string& dirname, - const framework::ProgramDesc& main_program) { + const std::string& param_filename) { const framework::BlockDesc& global_block = main_program.Block(0); framework::ProgramDesc* load_program = new framework::ProgramDesc(); framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + for (auto* var : global_block.AllVars()) { if (IsParameter(var, main_program)) { VLOG(3) << "parameter's name: " << var->Name(); @@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor, new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - // append_op - framework::OpDesc* op = load_block->AppendOp(); - op->SetType("load"); - op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); - op->CheckAttrs(); + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } } } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->CheckAttrs(); + } + executor.Run(*load_program, &scope, 0, true, true); + + VLOG(3) << "Ran loading successfully"; delete load_program; } @@ -77,20 +109,29 @@ std::unique_ptr Load(framework::Executor& executor, framework::Scope& scope, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; - LOG(INFO) << "loading model from " << model_filename; - std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); std::string program_desc_str; - inputfs.seekg(0, std::ios::end); - program_desc_str.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); - inputfs.read(&program_desc_str[0], program_desc_str.size()); - inputfs.close(); + ReadBinaryFile(model_filename, program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + + LoadPersistables(executor, scope, *main_program, dirname, ""); + return main_program; +} + +std::unique_ptr Load( + framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename) { + std::string model_filename = prog_filename; + std::string program_desc_str; + ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( new 
framework::ProgramDesc(program_desc_str)); - LoadPersistables(executor, scope, dirname, *main_program); + LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; } diff --git a/paddle/inference/io.h b/paddle/inference/io.h index 962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac..a7d7c499690620740d8627e7f5085d728d67f7c3 100644 --- a/paddle/inference/io.h +++ b/paddle/inference/io.h @@ -26,12 +26,18 @@ namespace inference { void LoadPersistables(framework::Executor& executor, framework::Scope& scope, + const framework::ProgramDesc& main_program, const std::string& dirname, - const framework::ProgramDesc& main_program); + const std::string& param_filename); std::unique_ptr Load(framework::Executor& executor, framework::Scope& scope, const std::string& dirname); +std::unique_ptr Load(framework::Executor& executor, + framework::Scope& scope, + const std::string& prog_filename, + const std::string& param_filename); + } // namespace inference } // namespace paddle diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt index 63afeb18aebdf446c01cd4fdac13d238467801e4..9fe76afb582a13b741ab086f0c62d77e86d4e8bb 100644 --- a/paddle/inference/tests/book/CMakeLists.txt +++ b/paddle/inference/tests/book/CMakeLists.txt @@ -5,25 +5,30 @@ function(inference_test TARGET_NAME) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(arg_list "") if(inference_test_ARGS) foreach(arg ${inference_test_ARGS}) - cc_test(test_inference_${TARGET_NAME}_${arg} - SRCS test_inference_${TARGET_NAME}.cc - DEPS ARCHIVE_START paddle_fluid ARCHIVE_END - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}_${arg}.inference.model) - set_tests_properties(test_inference_${TARGET_NAME}_${arg} - PROPERTIES DEPENDS test_${TARGET_NAME}) + list(APPEND arg_list "_${arg}") endforeach() else() - cc_test(test_inference_${TARGET_NAME} + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc DEPS ARCHIVE_START paddle_fluid ARCHIVE_END - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}.inference.model) - set_tests_properties(test_inference_${TARGET_NAME} + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) - endif() + endforeach() endfunction(inference_test) -inference_test(recognize_digits ARGS mlp) +inference_test(fit_a_line) inference_test(image_classification ARGS vgg resnet) inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp) +inference_test(recommender_system) +inference_test(rnn_encoder_decoder) +inference_test(understand_sentiment) +inference_test(word2vec) diff --git a/paddle/inference/tests/book/test_helper.h b/paddle/inference/tests/book/test_helper.h index 32db643fca2b026b674ea0b1ecd9aad5224e9e68..9774f3fbcb4af2bdf40f28456380f7e96189641c 100644 --- a/paddle/inference/tests/book/test_helper.h +++ b/paddle/inference/tests/book/test_helper.h @@ -30,6 +30,15 @@ void SetupTensor(paddle::framework::LoDTensor& input, } } +template +void SetupTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input.mutable_data(dims, 
paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); +} + template void SetupLoDTensor(paddle::framework::LoDTensor& input, paddle::framework::LoD& lod, @@ -37,7 +46,18 @@ void SetupLoDTensor(paddle::framework::LoDTensor& input, T upper) { input.set_lod(lod); int dim = lod[0][lod[0].size() - 1]; - SetupTensor(input, {dim, 1}, lower, upper); + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor& input, + paddle::framework::DDim dims, + paddle::framework::LoD lod, + std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input.set_lod(lod); + SetupTensor(input, dims, data); } template @@ -64,20 +84,35 @@ void CheckError(paddle::framework::LoDTensor& output1, count++; } } - EXPECT_EQ(count, 0) << "There are " << count << " different elements."; + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, std::vector& cpu_fetchs) { - // 1. Define place, executor and scope + // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); - // 2. Initialize the inference_program and load all parameters from file - auto inference_program = paddle::inference::Load(executor, *scope, dirname); + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + if (IsCombined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // Users are free to specify different filename + // (provided: the filenames are changed in the python api as well: io.py) + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = diff --git a/paddle/inference/tests/book/test_inference_fit_a_line.cc b/paddle/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 0000000000000000000000000000000000000000..201a2801cd69f1ce43173c0072fb35a85aac54e6 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor( + input, {batch_size, 13}, static_cast(0), static_cast(10)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_image_classification.cc b/paddle/inference/tests/book/test_inference_image_classification.cc index 35ff9431e9734bc3d20e1281f9d5d7f3e98f7524..36ea7c77a75fc0540922eb0f9eb3899733a0afa2 100644 --- a/paddle/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/inference/tests/book/test_inference_image_classification.cc @@ -29,11 +29,15 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + int64_t batch_size = 1; + paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. 
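A note on the IsCombined switch exercised by these tests: the sketch below shows how the two Load overloads declared in paddle/inference/io.h are intended to be selected, using the file names hard-coded in test_helper.h. LoadModel is a hypothetical wrapper written only for illustration; it is not part of this patch.

// Sketch only; relies on the Load declarations added in paddle/inference/io.h.
#include <memory>
#include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/inference/io.h"

std::unique_ptr<paddle::framework::ProgramDesc> LoadModel(
    paddle::framework::Executor& executor, paddle::framework::Scope& scope,
    const std::string& dirname, bool is_combined) {
  if (is_combined) {
    // One file for the program and one file for all parameters
    // (the names hard-coded in test_helper.h).
    return paddle::inference::Load(executor, scope,
                                   dirname + "/__model_combined__",
                                   dirname + "/__params_combined__");
  }
  // Program plus one file per parameter, all sited in `dirname`.
  return paddle::inference::Load(executor, scope, dirname);
}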
- SetupTensor( - input, {1, 3, 32, 32}, static_cast(0), static_cast(1)); + SetupTensor(input, + {batch_size, 3, 32, 32}, + static_cast(0), + static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); @@ -42,8 +46,7 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); + TestInference(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -52,8 +55,7 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); + TestInference(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc index 1eaf4022a1f27235fdd07e77e294eaba37a14249..922dbfd3338433a58632592667307e4da4dac9da 100644 --- a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc +++ b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc @@ -58,8 +58,7 @@ TEST(inference, label_semantic_roles) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); + TestInference(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.lod(); LOG(INFO) << output1.dims(); @@ -69,8 +68,7 @@ TEST(inference, label_semantic_roles) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); + TestInference(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.lod(); LOG(INFO) << output2.dims(); diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc index 48f887e6bc680087af4cce74b5c5422a4eba3726..af8c2b14c3b1651a3714de10422a1b5dd8e1519f 100644 --- a/paddle/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc @@ -29,6 +29,50 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + int64_t batch_size = 1; + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. + SetupTensor(input, + {batch_size, 1, 28, 28}, + static_cast(-1), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} + +TEST(inference, recognize_digits_combine) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [-1.0, 1.0]. @@ -42,7 +86,7 @@ TEST(inference, recognize_digits) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << output1.dims(); @@ -52,7 +96,7 @@ TEST(inference, recognize_digits) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << output2.dims(); diff --git a/paddle/inference/tests/book/test_inference_recommender_system.cc b/paddle/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec24c7e6ab7d1573b02bed294f43053ee53e4e57 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..248b9dce217232f1b88d74af9df31648f7779f98 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor( + word_data, lod, static_cast(0), static_cast(1)); + SetupLoDTensor( + trg_word, lod, static_cast(0), static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 0000000000000000000000000000000000000000..1afb644446569fc4fea8d7e9bf806daec016a5b2 --- /dev/null +++ b/paddle/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/inference/tests/book/test_inference_word2vec.cc b/paddle/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca0c040ff629c55b3392d67628ba93226c04a0ce --- /dev/null +++ b/paddle/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2072; // Hard-coding the size of dictionary + + SetupLoDTensor(first_word, lod, static_cast(0), dict_size); + SetupLoDTensor(second_word, lod, static_cast(0), dict_size); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda..9a21e00b12bc2795e1bf1591f7db60c0245bacd3 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -101,9 +101,9 @@ struct SparseAdagradFunctor { SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr, - param_data, moment_data, grad_width, - epsilon); + .stream()>>>( + grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, + param_data, moment_data, grad_width, epsilon); } }; diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index bf536687d398b8342e6ae76a07c11e5fe47483e0..af2c3ecd725ed1c916ff3b8a0291794d35a70e8b 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel { const T* grad_data = grad_tensor.template data(); int64_t* rows = nullptr; if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.mutable_rows()->cuda_data(); + rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace()); } else { rows = grad_merge.mutable_rows()->data(); } diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index 930c295a9cb31238954efeb87ff5ac2d3ca7bdc6..51b5bcb38f9d60b1246f818de62275dba5b087f9 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -58,8 +58,8 @@ class CompareOpInferShape : public framework::InferShapeBase { comment.type); auto dim_x = context->GetInputDim("X"); auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), - "The number of elements in X and Y should be same"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index b275fd75b3512343825170fc38565dd27f7f1c75..79b8c6f59c7ad3d77aa969f6b4f36f8050cfe823 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -62,7 +62,7 @@ class CompareOpKernel 
z->mutable_data(context.GetPlace()); int axis = context.Attr("axis"); ElementwiseComputeEx(context, x, y, axis, - z); + Functor(), z); } }; diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu index cea595d7c5d461b40198e622abf08248e7ca69e1..6406825d4a5c4538b5e2780efbe5ba86adce5b72 100644 --- a/paddle/operators/ctc_align_op.cu +++ b/paddle/operators/ctc_align_op.cu @@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( - num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank, - merge_repeated, dev_out_lod0_ptr, output_data); + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); // set output lod std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); diff --git a/paddle/operators/cum_op.h b/paddle/operators/cum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e3813ac9036169345bd69d1d42db4618c6c45ebf --- /dev/null +++ b/paddle/operators/cum_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + 
ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/cumsum_op.cc b/paddle/operators/cumsum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4933cc923d46ad053e16e060d94a39083709bbe1 --- /dev/null +++ b/paddle/operators/cumsum_op.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Cumsum operator"); + AddOutput("Out", "Output of Cumsum operator"); + AddAttr("axis", + "(int, default -1). The dimenstion to accumulate along. " + "-1 means the last dimenstion") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "bool, default false). Whether to perform exclusive cumsum") + .SetDefault(false); + AddAttr("reverse", + "bool, default false). If true, the cumsum is performed in " + "the reversed direction") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. 
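As an editorial aside (not part of the operator's DOC string): a self-contained reference for the exclusive and reverse attributes described above, on a 1-D input. Cumsum below is a hypothetical helper written only to pin down the semantics; it is not the operator kernel.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> Cumsum(std::vector<int> x, bool exclusive, bool reverse) {
  if (reverse) std::reverse(x.begin(), x.end());
  std::vector<int> out(x.size());
  int running = 0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (exclusive) {
      out[i] = running;   // first element of the result is 0
      running += x[i];
    } else {
      running += x[i];    // first element equals the first input element
      out[i] = running;
    }
  }
  if (reverse) std::reverse(out.begin(), out.end());
  return out;
}
// For x = {1, 2, 3, 4}:
//   Cumsum(x, /*exclusive=*/false, /*reverse=*/false) -> {1, 3, 6, 10}
//   Cumsum(x, /*exclusive=*/true,  /*reverse=*/false) -> {0, 1, 3, 6}
//   Cumsum(x, /*exclusive=*/false, /*reverse=*/true ) -> {10, 9, 7, 4}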
+)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/operators/cumsum_op.cu b/paddle/operators/cumsum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..90661c4269a9fadaba1446e04b3f661f1227a978 --- /dev/null +++ b/paddle/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>) diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index c32288d6984f126f2374a13973541f4f663b25a4..c24f97a85092ff14e8211ca8bc4bb9b155510a2c 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 07ebade31ff5b3d5c89156e28ff5fa0670a9a842..dc863cc598ec6015067f166b1544a5d20223662a 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 717e45ab31db9b9a6629fb33e17654dbf986d8c5..67efe4e1511e054d54f91b5aa22ce28f222ed20a 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = 
ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index 0de9a91c52b0ab82cd62604de318ce68e56b767d..cf11759404d3342b8a1c0080fa09f6cd57e735db 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index ae7a71e0244dfb8ad3e55683ac081f92bc36bea5..773125f5ca54e7b529df47a2823d56a5ad71e50d 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 213fe1f5a818873e8b666464cb112637261c598c..74abf7c4a58788eb0e53025886f10f5a43021a9e 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -365,10 +365,10 @@ template void ElementwiseComputeEx(const framework::ExecutionContext& ctx, const framework::Tensor* x, - const framework::Tensor* y, int axis, + const framework::Tensor* y, int axis, Functor func, framework::Tensor* z) { TransformFunctor functor( - x, y, z, ctx.template device_context(), Functor()); + x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h index 874fd3f09f2afaccfbfca75799cc3448f7393b03..0c5dd031ec46ebecaabb701839c0f69c02678eb0 100644 --- a/paddle/operators/elementwise_pow_op.h +++ b/paddle/operators/elementwise_pow_op.h @@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); } }; diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index c2749a8e6ba689233dab4f3c72de10bf01f39fab..6a88c5f6b4c869f8ab5b4fa3b112ffc264be7145 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); } }; diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 1c6d2ae4d05becaeed34d66cad398cc90f9d3ece..d9b774272cb7c9d87140bf30d2eabb44f49b2b7c 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -21,13 +21,6 @@ using Tensor = framework::Tensor; 
using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenMatrixMapRowMajor = Eigen::Map< - Eigen::Matrix>; -template -using ConstEigenMatrixMapRowMajor = Eigen::Map< - const Eigen::Matrix>; - class LayerNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Layer Normalization. - Layer Norm has been implemented as discussed in the paper: https://arxiv.org/abs/1607.06450 ... @@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450 } }; -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - auto *output = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - output->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - - auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); - auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); - - auto squre = [](T ele) { return ele * ele; }; - auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; - - mean_map = input_map.rowwise().mean(); - var_map = (input_map - mean_map.replicate(1, right)) - .unaryExpr(squre) - .rowwise() - .mean() - .unaryExpr(add_epslion); - - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): Some thinking about output_map, is it appropriate that - // `output_map` and `input_map` point to the same memory. 
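For reference, the computation implemented both by the Eigen-map kernel being removed here and by its elementwise-functor replacement in layer_norm_op.h, written as a minimal per-row sketch. In the operator, scale and bias are per-column vectors of length `right`; they are kept scalar below only for brevity, and LayerNormRow is a hypothetical illustration.

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> LayerNormRow(const std::vector<float>& x, float epsilon,
                                float scale, float bias) {
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= x.size();
  for (float v : x) var += (v - mean) * (v - mean);
  var /= x.size();                       // biased variance, as in the kernel
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = (x[i] - mean) / std::sqrt(var + epsilon) * scale + bias;
  }
  return y;
}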
- auto inv_std = var_map.unaryExpr(inv_std_func); - if (scale && bias) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) + - bias_map.replicate(left, 1); - } else if (scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)); - } else if (bias) { - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) + - bias_map.replicate(left, 1); - } else { - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)); - } - } -}; - class LayerNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -193,8 +116,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LayerNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Scale"), - "Input(Scale) of LayerNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Mean"), "Input(Mean) of LayerNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), @@ -237,125 +158,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *mean = ctx.Input("Mean"); - const auto *var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); - auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); - d_bias_map = d_y_map.colwise().sum(); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - auto d_scale_map = - EigenMatrixMapRowMajor(d_scale->data(), 1, right); - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // There are two equation to compute d_scale. 
One uses "Y" and the other - // does not use "Y" - d_scale_map = - ((x_map - mean_map.replicate(1, right)) - .cwiseProduct( - var_map.unaryExpr(inv_std_func).replicate(1, right)) - .cwiseProduct(d_y_map)) - .colwise() - .sum(); - } - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product_func = [](T ele) { return ele * ele * ele; }; - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): these code can be refined - if (d_scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } else { - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } - } - } -}; - } // namespace operators } // namespace paddle @@ -363,8 +165,9 @@ namespace ops = paddle::operators; REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, layer_norm_grad, ops::LayerNormGradOp); REGISTER_OP_CPU_KERNEL( - layer_norm, - ops::LayerNormKernel); + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); REGISTER_OP_CPU_KERNEL( layer_norm_grad, - ops::LayerNormGradKernel); + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.cu b/paddle/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..77d13b216f0e8d6d4434742908437f1eb74818c9 --- /dev/null +++ b/paddle/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h index bca35b91e6f52d35dee14aac9d080b52914942e3..3c436b89263758bbc0abcd1bb71cef3e1370d2a5 100644 --- a/paddle/operators/layer_norm_op.h +++ b/paddle/operators/layer_norm_op.h @@ -16,19 +16,222 @@ limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" + namespace paddle { namespace operators { +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + template class LayerNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } }; template class LayerNormGradKernel : public framework::OpKernel { public: - void Compute(const 
framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias && scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (d_scale) { + // dy_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } }; } // namespace operators diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 07372808bbf078bd2e9b0bb5782b95a046253f46..9684b6d4612c8e134ccad658840bd028a8508085 
100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { new_rows.resize(ids_dim[0]); auto gpu_place = boost::get(context.GetPlace()); - memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place, + // TODO(yuyang18): Strange code here. + memory::Copy(platform::CPUPlace(), + new_rows.CUDAMutableData(context.GetPlace()), gpu_place, ids_data, ids_dim[0] * sizeof(int64_t), stream); d_table->set_rows(new_rows); diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8..ce0a5f6cff873166e3308a625978ecefaed2aa29 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -331,6 +331,12 @@ template struct RowwiseAdd; template struct ColwiseSum; template struct ColwiseSum; +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index d47a7f818ded61baf31e46ea3b8ae3101324111f..c0a107470a4629506fc06dabc78a4a4716be6649 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -325,6 +325,31 @@ void ColwiseSum::operator()( vector->data()); } +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491..cb14d1e57468564710640773fdabd41896c178e0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -128,6 +128,18 @@ struct ColwiseSum { framework::Tensor* vec); }; +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index de591626df28e2bc3391b609f909612411398247..af4127788af0aaeb99199f7d6e2138a449b9fe51 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -87,6 +87,88 @@ class ColwiseSum { } }; +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + 
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 8a1ebb58c26578f076bf243adfbd51d10c682b99..4e15d01a3071995e1412fed2a451e4ad3f171862 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -128,7 +128,7 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index acdd87cb3550bc5f3891aed6fefd4301a3395f9f..54a41a67d063fdc9e4453cb339d92d4a406cecc2 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -126,7 +126,8 @@ struct SelectedRowsAddTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.cuda_data(), 
out_data, in1_row_numel); + in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -153,7 +154,9 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); + if (in1_rows.size()) { + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + } auto in1_place = input1.place(); PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); @@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor { dim3 grid(1, in1_rows.size()); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel); + in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -283,9 +287,10 @@ struct MergeAdd { MergeAddKernel< T, 256><<(context) - .stream()>>>(input_data, input_rows.cuda_data(), out_data, - out.mutable_rows()->cuda_data(), - out.rows().size(), input_width); + .stream()>>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); return out; } }; diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index f27631271a42b4d64abef00d7f119b85e32edda4..eaed2c30a80c75d56aef329f6e6f67b8bac3520a 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor { const framework::Tensor& src, framework::Vector index_lod, framework::Tensor& dst, bool is_src_index) { - size_t* index = index_lod.cuda_data(); auto src_dims = src.dims(); auto dst_dims = dst.dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2, @@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor { dim3 grid(8, 1); auto stream = context.stream(); CopyMatrixRowsKernel<<>>( - src_data, dst_data, index, height, width, is_src_index); + src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + width, is_src_index); } }; diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu index 65c9cfe4a0ec14d220ad237baa71703a783ed0fa..c2bd56448aa363160f6bf621ec67deff9e369c92 100644 --- a/paddle/operators/math/sequence_padding.cu +++ b/paddle/operators/math/sequence_padding.cu @@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor { if (norm_by_times) { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } } @@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor { if (norm_by_times) { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } else { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, - abs_offset_lod[level].cuda_data(), sequence_width, + abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, max_sequence_length, num_sequences); } } diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 
f66534a6812a66c737445ea96914a393077d7d65..c69bd3da7e741d3113de63fe08d22c23f772dda4 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -73,7 +73,8 @@ class MaxSeqPoolFunctor { dim3 grid(num_seq, 1); auto stream = context.stream(); KeMaxSequencePool<<>>( - in_data, starts.cuda_data(), out_data, max_index, num_seq, dim); + in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, + num_seq, dim); } }; diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu index fd4e28f6113729cd1fa9dc179bd9b601d29b8a7f..7cb9242db932ba8b2490f528ee08bd3f4b4e8f83 100644 --- a/paddle/operators/math/sequence_scale.cu +++ b/paddle/operators/math/sequence_scale.cu @@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor { SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width); + seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), + scales, seq_width); } }; diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc index 89045923f9ff2f33bc112b199c493047440e15c4..edb9de82509f9ee7619f5e90f49022de977a2ea4 100644 --- a/paddle/operators/parallel_do_op.cc +++ b/paddle/operators/parallel_do_op.cc @@ -79,9 +79,6 @@ inline void CopyOrShare(const framework::Variable &src, dst->GetMutable()->set_lod(src.Get().lod()); } else { Copy(src.Get(), dst_place, dst->GetMutable()); - framework::LoD lod(src.Get().lod()); - lod.CopyToPeer(dst_place); - dst->GetMutable()->set_lod(lod); } } else if (src.IsType()) { auto &src_sr = src.Get(); @@ -92,9 +89,6 @@ inline void CopyOrShare(const framework::Variable &src, dst_sr->set_rows(src_sr.rows()); } else { Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); - framework::Vector lod(src_sr.rows()); - lod.CopyToPeer(dst_place); - dst_sr->set_rows(lod); } } else { PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); @@ -152,9 +146,6 @@ class ParallelDoOp : public framework::OperatorBase { auto *sub_scope = sub_scopes[i]; auto *dst = sub_scope->Var(param)->GetMutable(); framework::Copy(src, place, dst); - framework::LoD lod(src.lod()); - lod.CopyToPeer(place); - dst->set_lod(lod); } } WaitOnPlaces(places); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index b3825212e1ac41b13a2f4cad2c128da39c5f6e71..d1a6d119d3da605b1d455d38f38a8808234b8ad1 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -307,7 +307,7 @@ class RowConvKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.cuda_data(); + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -345,7 +345,7 @@ class RowConvGradKernel int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.cuda_data(); + size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); math::SetConstant zero; diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu index a5311f15f0c607c880a6f12c0bef10b2dd8c8a79..4a7217cfd656f4f6b46d5a80a9c8e165c839df1d 100644 --- a/paddle/operators/sequence_erase_op.cu +++ b/paddle/operators/sequence_erase_op.cu @@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : 
public framework::OpKernel { // Copy LoD to GPU auto lod0 = lod[0]; auto lod_len = lod0.size(); - thrust::device_vector dev_in_lod = lod0; - size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data()); + const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 29f5aa3542c26c76a1b80da61ec6752019216131..d27befe4460550f7b7b30aa93a23c8e51aa52da9 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { dim3 grid(1, in_rows.size()); SparseSGDFunctorKernel< T, 256><<>>( - in_data, in_rows.cuda_data(), learning_rate->data(), out_data, - in_row_numel); + in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), + out_data, in_row_numel); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/operators/target_assign_op.h b/paddle/operators/target_assign_op.h index 82fca5724c0bd9fbfb60a98b91944700bfab9cdf..574919e1ef8d28c2a27b73b97a91d29e89896a6b 100644 --- a/paddle/operators/target_assign_op.h +++ b/paddle/operators/target_assign_op.h @@ -137,8 +137,8 @@ class TargetAssignKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); } - size_t* gt_lod_data = gt_lod.data(ctx.GetPlace()); - size_t* neg_lod_data = neg_lod.data(ctx.GetPlace()); + size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace()); + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); TargetAssignFunctor functor(box_data, label_data, match_idx_data, gt_lod_data, background_label, num, diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index de53fea0dd692167d61fcca552cc834a7916e209..d62f34030894e2fa21925bbc44e24b4e7d738d15 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -7,7 +7,3 @@ if(WITH_PYTHON) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT ANDROID) endif(WITH_PYTHON) - -if(WITH_DOC) - cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) -endif(WITH_DOC) diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc deleted file mode 100644 index b55ddee17616ced4de659be8e55acd5e072c66b7..0000000000000000000000000000000000000000 --- a/paddle/pybind/print_operators_doc.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include -#include // std::stringstream -#include - -#include "paddle/framework/op_info.h" -#include "paddle/framework/op_registry.h" -#include "paddle/pybind/pybind.h" - -std::string Escape(const std::string& s) { - std::string r; - for (size_t i = 0; i < s.size(); i++) { - switch (s[i]) { - case '\"': - r += "\\\""; - break; - case '\\': - r += "\\\\"; - break; - case '\n': - r += "\\n"; - break; - case '\t': - r += "\\t"; - case '\r': - break; - default: - r += s[i]; - break; - } - } - return r; -} - -std::string AttrType(paddle::framework::proto::AttrType at) { - switch (at) { - case paddle::framework::proto::INT: - return "int"; - case paddle::framework::proto::FLOAT: - return "float"; - case paddle::framework::proto::STRING: - return "string"; - case paddle::framework::proto::BOOLEAN: - return "bool"; - case paddle::framework::proto::INTS: - return "int array"; - case paddle::framework::proto::FLOATS: - return "float array"; - case paddle::framework::proto::STRINGS: - return "string array"; - case paddle::framework::proto::BOOLEANS: - return "bool array"; - case paddle::framework::proto::BLOCK: - return "block id"; - case paddle::framework::proto::LONG: - return "long"; - } - return "UNKNOWN"; // not possible -} - -void PrintVar(const paddle::framework::proto::OpProto::Var& v, - std::stringstream& ss) { - ss << " { " - << "\n" - << " \"name\" : \"" << Escape(v.name()) << "\",\n" - << " \"comment\" : \"" << Escape(v.comment()) << "\",\n" - << " \"duplicable\" : " << v.duplicable() << ",\n" - << " \"intermediate\" : " << v.intermediate() << "\n" - << " },"; -} - -void PrintAttr(const paddle::framework::proto::OpProto::Attr& a, - std::stringstream& ss) { - ss << " { " - << "\n" - << " \"name\" : \"" << Escape(a.name()) << "\",\n" - << " \"type\" : \"" << AttrType(a.type()) << "\",\n" - << " \"comment\" : \"" << Escape(a.comment()) << "\",\n" - << " \"generated\" : " << a.generated() << "\n" - << " },"; -} - -void PrintOpProto(const std::string& type, - const paddle::framework::OpInfo& opinfo, - std::stringstream& ss) { - std::cerr << "Processing " << type << "\n"; - - const paddle::framework::proto::OpProto* p = opinfo.proto_; - if (p == nullptr) { - return; // It is possible that an operator doesn't have OpProto. 
- } - - ss << "{\n" - << " \"type\" : \"" << Escape(p->type()) << "\",\n" - << " \"comment\" : \"" << Escape(p->comment()) << "\",\n"; - - ss << " \"inputs\" : [ " - << "\n"; - for (int i = 0; i < p->inputs_size(); i++) { - PrintVar(p->inputs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ], " - << "\n"; - - ss << " \"outputs\" : [ " - << "\n"; - for (int i = 0; i < p->outputs_size(); i++) { - PrintVar(p->outputs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ], " - << "\n"; - - ss << " \"attrs\" : [ " - << "\n"; - for (int i = 0; i < p->attrs_size(); i++) { - PrintAttr(p->attrs(i), ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << " ] " - << "\n"; - - ss << "},"; -} - -int main() { - std::stringstream ss; - ss << "[\n"; - for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) { - PrintOpProto(iter.first, iter.second, ss); - } - ss.seekp(-1, ss.cur); // remove the trailing comma - ss << "]\n"; - std::cout << ss.str(); -} diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ba496db5f834efe767bfe446a46877932faa81a0..2f8dd48efe1d7252f5befd2f682219661a60ba03 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -118,8 +118,6 @@ EOF make -j `nproc` gen_proto_py make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs - make -j `nproc` print_operators_doc - paddle/pybind/print_operators_doc > doc/en/html/operators.json popd fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 4af4ac4f5e43543449ae922d7eb2a5740372f68f..486c094a6ac3c9ec260e231ba2333c718dfc164b 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -10,8 +10,6 @@ cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs -make -j `nproc` print_operators_doc -paddle/pybind/print_operators_doc > doc/en/html/operators.json # check websites for broken links linkchecker doc/en/html/index.html diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index fd8c4a69da897cc39f31f435036e32c41285fb59..ab84f1c292b97f55d88165e7ef0e32b93d542802 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/memory/memory.h" int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); std::vector new_argv; std::string gflags_env; for (int i = 0; i < argc; ++i) { @@ -35,7 +36,6 @@ int main(int argc, char** argv) { int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); - testing::InitGoogleTest(&argc, argv); paddle::memory::Used(paddle::platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 3ee58393c72c0b6f9bec96be51ad3946752a35dd..73acbf3e009965f9eaaade77d2fe4cf4f99d4379 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -29,7 +29,7 @@ import optimizer import learning_rate_decay import backward import regularizer -from param_attr import ParamAttr +from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler @@ -41,11 +41,26 @@ import profiler Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ - 'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay', - 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', - 'ParamAttr' - 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler', - 'memory_optimize', 'profiler' + 'io', + 'initializer', + 'layers', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'CPUPlace', + 'CUDAPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'SimpleDistributeTranspiler', + 'DistributeTranspiler', + 'memory_optimize', + 'profiler', ] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 121b407cae41fa477843b7252ebacc9053d5f7aa..cd89dba72db2470c1d5bafd4523c21c49e7e2b53 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -300,6 +300,9 @@ class DistributeTranspiler: pass return orig_shape + def _op_input_var(self, op, varname): + pass + def _is_op_on_pserver(self, endpoint, all_ops, idx): """ Recursively check if the op need to run on current server. @@ -309,44 +312,51 @@ class DistributeTranspiler: p.name for p in self.param_grad_ep_mapping[endpoint]["params"] ] op = all_ops[idx] - if op.inputs.has_key("Param"): - if op.inputs["Param"].name in param_names: + input_names = set(op.input_names) + # TODO(typhoonzero): using Param and Grad input name to identify + # that the operator is an optimization operator, need a better way. 
+ if "Param" in input_names: + if op.input("Param")[0] in param_names: return True else: for n in param_names: - if same_or_split_var(n, op.inputs[ - "Param"].name) and n != op.inputs["Param"].name: + if same_or_split_var(n, op.input("Param")[0]) \ + and n != op.input("Param")[0]: return True return False else: j = idx - 1 while j >= 0: prev_op = all_ops[j] - prev_output_names = [o.name for o in prev_op.outputs.values()] - prev_input_names = [o.name for o in prev_op.inputs.values()] + # prev_output_names = [o.name for o in prev_op.outputs.values()] + # prev_input_names = [o.name for o in prev_op.inputs.values()] + # NOTE(typhoonzero): consider list input/output + prev_output_names = prev_op.desc.output_arg_names() + prev_input_names = prev_op.desc.input_arg_names() found1 = False found2 = False - for _, v in op.inputs.iteritems(): - if v.name in prev_output_names: + for varname in op.desc.input_arg_names(): + if varname in prev_output_names: found1 = self._is_op_on_pserver(endpoint, all_ops, j) # later ops may produce output for prev op's next batch use. - for _, v in op.outputs.iteritems(): - if v.name in prev_input_names: + for varname in op.desc.output_arg_names(): + if varname in prev_input_names: found2 = self._is_op_on_pserver(endpoint, all_ops, j) if found1 or found2: return True j -= 1 return False - def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint): + def _append_pserver_ops(self, optimize_block, opt_op, endpoint): + program = optimize_block.program new_inputs = dict() # update param/grad shape first, then other inputs like # moment can use the updated shape - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key == "Grad": grad_block = None for g in self.param_grad_ep_mapping[endpoint]["grads"]: - if same_or_split_var(g.name, var.name): + if same_or_split_var(g.name, opt_op.input(key)[0]): grad_block = g break if not grad_block: @@ -362,11 +372,11 @@ class DistributeTranspiler: if self.trainers > 1: vars2merge = self._create_var_for_trainers( program.global_block(), grad_block, self.trainers) - program.global_block().append_op( + optimize_block.append_op( type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}) - program.global_block().append_op( + optimize_block.append_op( type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, @@ -376,7 +386,7 @@ class DistributeTranspiler: # param is already created on global program param_block = None for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, var.name): + if same_or_split_var(p.name, opt_op.input(key)[0]): param_block = p break if not param_block: @@ -389,11 +399,12 @@ class DistributeTranspiler: new_inputs[key] = tmpvar - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key in ["Param", "Grad"]: continue # update accumulator variable shape param_shape = new_inputs["Param"].shape + var = program.global_block().vars[opt_op.input(key)[0]] new_shape = self._get_optimizer_input_shape(opt_op.type, key, var.shape, param_shape) tmpvar = program.global_block().create_var( @@ -402,40 +413,41 @@ class DistributeTranspiler: dtype=var.dtype, shape=new_shape) new_inputs[key] = tmpvar - # create var in pserver program global block. - # TODO(typhoonzero): put blocks in one program to avoid create two - # variables. 
- pserver_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=new_shape) # change output's ParamOut variable - opt_op.outputs["ParamOut"] = new_inputs["Param"] - program.global_block().append_op( + outputs = self._get_output_map_from_op(program.global_block(), opt_op) + outputs["ParamOut"] = new_inputs["Param"] + optimize_block.append_op( type=opt_op.type, inputs=new_inputs, - outputs=opt_op.outputs, + outputs=outputs, attrs=opt_op.attrs) - def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): + def _append_pserver_non_opt_ops(self, optimize_block, opt_op): + program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated - for _, var in opt_op.inputs.iteritems(): - program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - pserver_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - program.global_block().append_op( + inputs = self._get_input_map_from_op(self.program.global_block().vars, + opt_op) + for var in inputs.itervalues(): + if type(var) == list: + varlist = var + else: + varlist = [var] + for var in varlist: + if not program.global_block().vars.has_key(var.name): + program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + outputs = self._get_output_map_from_op(self.program.global_block().vars, + opt_op) + + optimize_block.append_op( type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, + inputs=inputs, + outputs=outputs, attrs=opt_op.attrs) def get_pserver_program(self, endpoint): @@ -465,26 +477,25 @@ class DistributeTranspiler: dtype=v.dtype, shape=v.shape) # step6 - optimize_sub_program = Program() + optimize_block = pserver_program.create_block(0) # Iterate through the ops and append ops as needed for idx, opt_op in enumerate(self.optimize_ops): is_op_on_pserver = self._is_op_on_pserver(endpoint, self.optimize_ops, idx) if not is_op_on_pserver: continue - if opt_op.inputs.has_key("Grad"): - self._append_pserver_ops(optimize_sub_program, pserver_program, - opt_op, endpoint) + if "Grad" in opt_op.desc.input_arg_names(): + self._append_pserver_ops(optimize_block, opt_op, endpoint) else: - self._append_pserver_non_opt_ops(optimize_sub_program, - pserver_program, opt_op) + self._append_pserver_non_opt_ops(optimize_block, opt_op) + # Append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", inputs={}, outputs={}, attrs={ - "OptimizeBlock": optimize_sub_program.global_block(), + "OptimizeBlock": optimize_block, "endpoint": endpoint, "ParamList": [ p.name @@ -499,6 +510,30 @@ class DistributeTranspiler: pserver_program.sync_with_cpp() return pserver_program + def _get_input_map_from_op(self, varmap, op): + iomap = dict() + for key in op.input_names: + vars = [] + for varname in op.input(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + def _get_output_map_from_op(self, varmap, op): + iomap = dict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. 
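(Editor's aside, not part of the patch.) The _get_input_map_from_op / _get_output_map_from_op helpers added just above rebuild a {slot name -> Variable} dictionary from an operator's argument names, collapsing a single-argument slot to the bare Variable and keeping duplicable slots as lists. A minimal standalone sketch of that behaviour; the FakeOp stand-in and the variable names are illustrative assumptions, not real Fluid objects:

# Toy stand-ins only; the real helper receives a framework Operator and Variable objects.
class FakeOp(object):
    def __init__(self, inputs):
        self._inputs = inputs              # e.g. {"X": ["x0", "x1"], "Y": ["y0"]}
        self.input_names = list(inputs)    # slot names, like op.input_names

    def input(self, key):
        return self._inputs[key]           # argument (variable) names bound to a slot


def get_input_map_from_op(varmap, op):
    # Same shape of logic as the helper above: one argument -> the Variable itself,
    # several arguments (a duplicable slot) -> a list of Variables.
    iomap = dict()
    for key in op.input_names:
        vars = [varmap[name] for name in op.input(key)]
        iomap[key] = vars[0] if len(vars) == 1 else vars
    return iomap


varmap = {"x0": "Variable(x0)", "x1": "Variable(x1)", "y0": "Variable(y0)"}
op = FakeOp({"X": ["x0", "x1"], "Y": ["y0"]})
print(get_input_map_from_op(varmap, op))
# "X" maps to a two-element list, "Y" maps to the single value "Variable(y0)"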
@@ -529,17 +564,21 @@ class DistributeTranspiler: # 2. rename op outputs for op in orig_s_prog.global_block().ops: + new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False - for key, var in op.outputs.iteritems(): - newname, _ = _get_splited_name_and_shape(var.name) + for key in op.output_names: + newname, _ = _get_splited_name_and_shape(op.output(key)[0]) if newname: op_on_pserver = True new_outputs[key] = created_var_map[newname] - elif var.name in pserver_vars: + elif op.output(key)[0] in pserver_vars: op_on_pserver = True - new_outputs[key] = pserver_vars[var.name] + new_outputs[key] = pserver_vars[op.output(key)[0]] + + # most startup program ops have no inputs + new_inputs = self._get_input_map_from_op(pserver_vars, op) if op_on_pserver: if op.type in [ @@ -548,7 +587,7 @@ class DistributeTranspiler: op.attrs["shape"] = new_outputs["Out"].shape s_prog.global_block().append_op( type=op.type, - inputs=op.inputs, + inputs=new_inputs, outputs=new_outputs, attrs=op.attrs) return s_prog diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 0eddcc3a5ab6f71aa5500c3b98b63c0937c7ddfc..01cbdb3ec487d6e2e60890619131de0067d40db9 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -47,27 +47,13 @@ def as_numpy(tensor): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() - tensor_data = np.array(tensor) - if len(lod) == 0: - ans = tensor_data - else: - raise RuntimeError("LoD Calculate lacks unit tests and buggy") - # elif len(lod) == 1: - # ans = [] - # idx = 0 - # while idx < len(lod) - 1: - # ans.append(tensor_data[lod[idx]:lod[idx + 1]]) - # idx += 1 - # else: - # for l in reversed(lod): - # ans = [] - # idx = 0 - # while idx < len(l) - 1: - # ans.append(tensor_data[l[idx]:l[idx + 1]]) - # idx += 1 - # tensor_data = ans - # ans = tensor_data - return ans + if len(lod) > 0: + raise RuntimeError( + "Some of your fetched tensors hold LoD information. \ + They cannot be completely cast to Python ndarray. 
\ + Please set the parameter 'return_numpy' as 'False' to \ + return LoDTensor itself directly.") + return np.array(tensor) def has_feed_operators(block, feed_targets, feed_holder_name): @@ -306,7 +292,6 @@ class Executor(object): core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) ] - if return_numpy: outs = as_numpy(outs) return outs diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index a12427258e9d3142abcb84249a10dabd8e96b792..a517db68c5886fbcbe19e6981aee5bf3971352e4 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -740,6 +740,9 @@ class Block(object): raise e self.desc.remove_op(start, end + 1) + def slice_ops(self, start, end): + return list(self.ops)[start:end] + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index b9c0d12ad6cf09e66df6b1a8da09df275c79a3f6..8c70fd90eff6d40c77ea4b74f5b04d6ab62401be 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -14,14 +14,37 @@ import framework import numpy as np +import contextlib __all__ = [ - 'Constant', - 'Uniform', - 'Normal', - 'Xavier', + 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', + 'init_on_cpu' ] +_force_init_on_cpu_ = False + + +def force_init_on_cpu(): + return _force_init_on_cpu_ + + +@contextlib.contextmanager +def init_on_cpu(): + """ + Switch program with `with` statement + + Examples: + >>> with init_on_cpu(): + >>> step = layers.create_global_var() + + """ + global _force_init_on_cpu_ + + pre_state = force_init_on_cpu() + _force_init_on_cpu_ = True + yield + _force_init_on_cpu_ = pre_state + class Initializer(object): """Base class for variable initializers @@ -80,7 +103,7 @@ class ConstantInitializer(Initializer): """Implements the constant initializer """ - def __init__(self, value=0.0): + def __init__(self, value=0.0, force_cpu=False): """Constructor for ConstantInitializer Args: @@ -89,6 +112,7 @@ class ConstantInitializer(Initializer): assert value is not None super(ConstantInitializer, self).__init__() self._value = value + self._force_cpu = force_cpu def __call__(self, var, block): """Add constant initialization ops for a variable @@ -110,7 +134,8 @@ class ConstantInitializer(Initializer): attrs={ "shape": var.shape, "dtype": int(var.dtype), - "value": self._value + "value": float(self._value), + 'force_cpu': self._force_cpu or force_init_on_cpu() }) var.op = op return op diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 613dc20b6ea5533d126a73b7ec47796b3f812db5..0f43e46082a8988be4805a2b750227312ba80ff3 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -342,7 +342,11 @@ def save_inference_model(dirname, prepend_feed_ops(inference_program, feeded_var_names) append_fetch_ops(inference_program, fetch_var_names) - model_file_name = dirname + "/__model__" + if save_file_name == None: + model_file_name = dirname + "/__model__" + else: + model_file_name = dirname + "/__model_combined__" + with open(model_file_name, "wb") as f: f.write(inference_program.desc.serialize_to_string()) @@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) - model_file_name = dirname + "/__model__" + if load_file_name == None: + 
model_file_name = dirname + "/__model__" + else: + model_file_name = dirname + "/__model_combined__" + with open(model_file_name, "rb") as f: program_desc_str = f.read() diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index 79a130a3eb148e6c5a8fa3cdf174780b354c23c9..9b5f22759cf5ccba841919ec158ff58e6505dcf7 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -14,6 +14,7 @@ from ..framework import Variable, unique_name from layer_function_generator import OpProtoHolder +from ..initializer import force_init_on_cpu __all__ = ['monkey_patch_variable'] @@ -36,9 +37,12 @@ def monkey_patch_variable(): block.append_op( type="fill_constant", outputs={'Out': [var]}, - attrs={'dtype': var.dtype, - 'shape': shape, - 'value': value}) + attrs={ + 'dtype': var.dtype, + 'shape': shape, + 'value': value, + 'force_cpu': force_init_on_cpu() + }) return var def create_scalar(block, value, dtype): diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 99168ecc228045a0206aff1b7de5fc17c1438fe2..5ebd329fc0285a39111a23b3c58c80944cfe23f6 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -65,6 +65,7 @@ __all__ = [ 'beam_search', 'row_conv', 'multiplex', + 'layer_norm', ] @@ -184,7 +185,7 @@ def fc(input, helper.append_op( type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) # add bias - pre_activation = helper.append_bias_op(pre_bias) + pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation return helper.append_activation(pre_activation) @@ -641,8 +642,8 @@ def dynamic_gru(input, Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". Returns: - Variable: The hidden state of GRU. The shape is (T \\times D), and lod \ - is the same with the input. + Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ + and lod is the same with the input. Examples: .. code-block:: python @@ -990,7 +991,7 @@ def square_error_cost(input, label, **kwargs): label(Variable): Label tensor, has target labels. Returns: - Variable: The tensor variable storing the element-wise squared error + Variable: The tensor variable storing the element-wise squared error \ difference of input and label. Examples: @@ -1214,7 +1215,7 @@ def conv2d(input, act(str): Activation type. Default: None Returns: - Variable: The tensor variable storing the convolution and + Variable: The tensor variable storing the convolution and \ non-linearity activation result. Raises: @@ -1565,6 +1566,102 @@ def batch_norm(input, return helper.append_activation(batch_norm_out) +def layer_norm(input, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + name=None): + """ + **Layer Normalization** + + Assume feature vectors exist on dimensions + :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics + along these dimensions for each feature vector :math:`a` with size + :math:`H`, then normalize each feature vector using the corresponding + statistics. After that, apply learnable gain and bias on the normalized + tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. + + Refer to `Layer Normalization `_ + + The formula is as follows: + + .. 
math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i + + \\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}(a_i - \\mu)^2} + + h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + + Args: + input(Variable): The input tensor variable. + scale(bool): Whether to learn the adaptive gain :math:`g` after + normalization. + shift(bool): Whether to learn the adaptive bias :math:`b` after + normalization. + begin_norm_axis(int): The normalization will be performed along + dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + gain :math:`g`. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. + act(str): Activation to be applied to the output of layer normalization. + + Returns: + Variable: A tensor variable with the same shape as the input. + + Examples: + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + """ + helper = LayerHelper('layer_norm', **locals()) + dtype = helper.input_dtype() + + # create input and parameters + inputs = {'X': input} + input_shape = input.shape + param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])] + if scale: + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0)) + inputs['Scale'] = scale + if shift: + assert bias_attr is not False + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + inputs['Bias'] = bias + + # create output + mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + layer_norm_out = helper.create_tmp_variable(dtype) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return helper.append_activation(layer_norm_out) + + def beam_search_decode(ids, scores, name=None): helper = LayerHelper('beam_search_decode', **locals()) sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index 38dea2892fc18a9c493878d816a246522e9b9886..bb3f71abbb00793a32b8c288ee3ea255cc9c584d 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -65,6 +65,8 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', + 'uniform_random', + 'cumsum', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 704e040b9f478ef61991cfbe175f1cdeaf102763..2d4e0ab0cc66861d8bd31e196740688ab8fa6262 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable -from ..initializer import Constant +from ..initializer import Constant, force_init_on_cpu from ..core import DataType import numpy @@ -69,12 +69,30 @@ def create_parameter(shape, default_initializer) -def create_global_var(shape, value, dtype, persistable=False, name=None): +def create_global_var(shape,
+ value, + dtype, + persistable=False, + force_cpu=False, + name=None): + """ + Create a global variable. such as global_step + Args: + shape(list[int]): shape of the variable + value(float): the value of the variable + dtype(string): element type of the parameter + persistable(bool): if this variable is persistable + force_cpu(bool): force this variable to be on CPU + + Returns: + Variable: the created Variable + """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( dtype=dtype, shape=shape, persistable=persistable, name=name) helper.set_variable_initializer( - var, initializer=Constant(value=float(value))) + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) return var @@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): dtype(np.dtype|core.DataType|str): Data type of the output tensor. value(float): The constant value used to initialize the output tensor. out(Variable): The output tensor. + force_cpu(True|False): data should be on CPU if set true. Returns: Variable: The tensor variable storing the output. @@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'shape': shape, 'dtype': out.dtype, 'value': float(value), - 'force_cpu': force_cpu + 'force_cpu': force_cpu or force_init_on_cpu() }) out.stop_gradient = True return out diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py index 13dc98075f7d32f9dda56a890b98451ef81af363..2a2a29fd9cbedc138dc82ca75ccd78208fd33195 100644 --- a/python/paddle/v2/fluid/learning_rate_decay.py +++ b/python/paddle/v2/fluid/learning_rate_decay.py @@ -14,6 +14,7 @@ import layers from framework import Variable +from initializer import init_on_cpu __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -54,11 +55,14 @@ def exponential_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for exponential_decay.") - # update learning_rate - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * (decay_rate**div_res) + with init_on_cpu(): + # update learning_rate + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * (decay_rate**div_res) + + return decayed_lr def natural_exp_decay(learning_rate, @@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for natural_exp_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + + return decayed_lr def inverse_time_decay(learning_rate, @@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return learning_rate / (1 + decay_rate * div_res) + return decayed_lr def polynomial_decay(learning_rate, @@ -160,22 
+170,27 @@ def polynomial_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - if cycle: - div_res = layers.ceil(x=(global_step / decay_steps)) - zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0) - one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0) - - with layers.Switch() as switch: - with switch.case(layers.equal(x=global_step, y=zero_var)): - layers.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = layers.elementwise_min(x=global_step, y=decay_steps_var) - - return (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate + with init_on_cpu(): + if cycle: + div_res = layers.ceil(x=(global_step / decay_steps)) + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with layers.Switch() as switch: + with switch.case(layers.equal(x=global_step, y=zero_var)): + layers.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = layers.elementwise_min( + x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(global_step, boundaries, values): @@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values): if not isinstance(global_step, Variable): raise ValueError("global_step is required for piecewise_decay.") - lr = layers.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") - - with layers.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = layers.fill_constant( - shape=[1], dtype='float32', value=float(boundaries[i])) - value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(layers.less_than(global_step, boundary_val)): - layers.assign(value_var, lr) - last_value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[len(values) - 1])) - with switch.default(): - layers.assign(last_value_var, lr) + with init_on_cpu(): + lr = layers.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + with layers.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = layers.fill_constant( + shape=[1], dtype='float32', value=float(boundaries[i])) + value_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(layers.less_than(global_step, boundary_val)): + layers.assign(value_var, lr) + last_value_var = layers.fill_constant( + shape=[1], + dtype='float32', + value=float(values[len(values) - 1])) + with switch.default(): + layers.assign(last_value_var, lr) return lr diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py index 8bb8cf7b1a5ddf44427637229bdc31ac0e151e44..53e0991ee8c318e0c95018b57ad48f404ce8beae 100644 --- a/python/paddle/v2/fluid/memory_optimization_transpiler.py +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -92,14 +92,13 @@ class 
ControlFlowGraph(object): live_in = defaultdict(set) live_out = defaultdict(set) while True: - for i in range(self.op_size): + for i in range(self.op_size, 0, -1): live_in[i] = set(self._live_in[i]) live_out[i] = set(self._live_out[i]) - self._live_in[i] = self._uses[i] | ( - self._live_out[i] - self._defs[i]) for s in self._successors[i]: self._live_out[i] |= self._live_in[s] - + self._live_in[i] = self._uses[i] | ( + self._live_out[i] - self._defs[i]) if self._reach_fixed_point(live_in, live_out): break diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index cb63d43709e23ae04c4d23457bbb79e6f7f0ce3c..be7878f869b509fa1117e305aee662cc0123bbcc 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries, Returns: - Variable: A 3-D Tensor computed by multi-head scaled dot product + Variable: A 3-D Tensor computed by multi-head scaled dot product \ attention. Raises: @@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries, x=product, shape=[-1, product.shape[-1]], act="softmax"), shape=product.shape) if dropout_rate: - weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False) + weights = layers.dropout( + weights, dropout_prob=dropout_rate, is_test=False) ctx_multiheads = layers.matmul(weights, v) return __combine_heads(ctx_multiheads) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 7844a4e2df1ce3989e48082f6472292560fbf1ee..f8a00e3a5fb4038a97a951a01c3a2f1a4488ae75 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -190,6 +190,8 @@ class Optimizer(object): # Create any accumulators program = loss.block.program with program_guard(program, startup_program): + global_block = framework.default_main_program().global_block() + start = len(global_block.ops) self.helper = LayerHelper(self.__class__.__name__) self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) @@ -203,19 +205,14 @@ class Optimizer(object): param_and_grad) optimize_ops.append(optimize_op) - # Returned list of ops can include more ops in addition - # to optimization ops - return_ops = optimize_ops - # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - finish_ops = self._finish_update(loss.block) - if finish_ops is not None: - return_ops += finish_ops + self._finish_update(loss.block) if self._global_step is not None: - return_ops.append(self._increment_global_step(loss.block)) - return return_ops + self._increment_global_step(loss.block) + end = len(global_block.ops) + return global_block.slice_ops(start, end) def minimize(self, loss, diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore index f0b574b9396706a1d68393482296360362dca750..dd28d354f4160b4be68b46a7bebcdf2097d5811a 100644 --- a/python/paddle/v2/fluid/tests/book/.gitignore +++ b/python/paddle/v2/fluid/tests/book/.gitignore @@ -1 +1 @@ -recognize_digits_*.inference.model +*.inference.model diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 06860a2a465c6f8590336670372eb6ff43b10594..b3332b4810be3c95a5a48ebde92af89cafd94326 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -15,15 +15,13 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import contextlib +import numpy import 
unittest import math import sys -def main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - +def train(use_cuda, save_dirname): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -51,14 +49,15 @@ def main(use_cuda): PASS_NUM = 100 for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[avg_cost]) print(avg_loss_value) if avg_loss_value[0] < 10.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ['x'], + [y_predict], exe) return if math.isnan(float(avg_loss_value)): sys.exit("got NaN loss, training failed.") @@ -66,6 +65,43 @@ def main(use_cuda): avg_loss_value[0])) +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_x}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "fit_a_line.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + class TestFitALine(unittest.TestCase): def test_cpu(self): with self.program_scope_guard(): diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 1491f7a8d5496445f8300d3db1d367bb3167d2c7..f33e81186bd8ad74b4d569383bce34afb2c40c24 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -18,6 +18,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.v2.fluid as fluid +from paddle.v2.fluid.initializer import init_on_cpu import contextlib import time import unittest @@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None): # TODO(qiao) # check other optimizers and check why out will be NAN - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + global_step = fluid.layers.create_global_var( + shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.learning_rate_decay.exponential_decay( + learning_rate=0.0001, + global_step=global_step, + decay_steps=100000, + decay_rate=0.5, + staircase=True), + global_step=global_step) sgd_optimizer.minimize(avg_cost) # TODO(qiao) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py 
b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index d8f0ad89cd89215ac83a133bd27a53c4b904363f..244c1749cd522faec26f8cf8e71f7469843f534e 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -78,7 +78,7 @@ def conv_net(img, label): return loss_net(conv_pool_2, label) -def train(nn_type, use_cuda, parallel, save_dirname): +def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): if use_cuda and not fluid.core.is_compiled_with_cuda(): return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') @@ -143,8 +143,10 @@ def train(nn_type, use_cuda, parallel, save_dirname): avg_loss_val = numpy.array(avg_loss_set).mean() if float(acc_val) > 0.85: # test acc > 85% if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ["img"], - [prediction], exe) + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + save_file_name=save_param_filename) return else: print( @@ -156,7 +158,7 @@ def train(nn_type, use_cuda, parallel, save_dirname): raise AssertionError("Loss of recognize digits is too large") -def infer(use_cuda, save_dirname=None): +def infer(use_cuda, save_dirname=None, param_filename=None): if save_dirname is None: return @@ -167,13 +169,14 @@ def infer(use_cuda, save_dirname=None): # the feed_target_names (the names of variables that will be feeded # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + [inference_program, feed_target_names, fetch_targets + ] = fluid.io.load_inference_model(save_dirname, exe, param_filename) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. + batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, - [1, 1, 28, 28]).astype("float32") + [batch_size, 1, 28, 28]).astype("float32") # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. 
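(Editor's aside, not part of the patch.) This test now exercises two persistence modes: parameters saved as separate files (the default) and parameters combined into a single file via save_file_name. A minimal sketch of both modes using the APIs changed above; the tiny network, the directory name "digits.model" and the random input are illustrative assumptions:

import numpy
import paddle.v2.fluid as fluid

# A tiny stand-in network, just so the save/load calls have something to persist.
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Default mode: one file per parameter, program desc written to "__model__".
fluid.io.save_inference_model("digits.model", ["img"], [prediction], exe)

# Combined mode: a single parameter file; per the io.py change above, the
# program desc then goes to "__model_combined__".
fluid.io.save_inference_model(
    "digits.model", ["img"], [prediction], exe,
    save_file_name="__params_combined__")

# Loading must name the same combined file; omit the argument for the default mode.
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "digits.model", exe, "__params_combined__")

tensor_img = numpy.random.uniform(-1.0, 1.0, [1, 1, 28, 28]).astype("float32")
results = exe.run(program,
                  feed={feed_names[0]: tensor_img},
                  fetch_list=fetch_targets)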
@@ -183,36 +186,45 @@ def infer(use_cuda, save_dirname=None): print("infer results: ", results[0]) -def main(use_cuda, parallel, nn_type): +def main(use_cuda, parallel, nn_type, combine): if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" + save_filename = None + if combine == True: + save_filename = "__params_combined__" else: save_dirname = None + save_filename = None train( nn_type=nn_type, use_cuda=use_cuda, parallel=parallel, - save_dirname=save_dirname) - infer(use_cuda=use_cuda, save_dirname=save_dirname) + save_dirname=save_dirname, + save_param_filename=save_filename) + infer( + use_cuda=use_cuda, + save_dirname=save_dirname, + param_filename=save_filename) class TestRecognizeDigits(unittest.TestCase): pass -def inject_test_method(use_cuda, parallel, nn_type): +def inject_test_method(use_cuda, parallel, nn_type, combine): def __impl__(self): prog = fluid.Program() startup_prog = fluid.Program() scope = fluid.core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - main(use_cuda, parallel, nn_type) + main(use_cuda, parallel, nn_type, combine) - fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda' - if use_cuda else 'cpu', 'parallel' - if parallel else 'normal') + fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda' + if use_cuda else 'cpu', 'parallel' + if parallel else 'normal', 'combine' + if combine else 'separate') setattr(TestRecognizeDigits, fn, __impl__) @@ -221,7 +233,10 @@ def inject_all_tests(): for use_cuda in (False, True): for parallel in (False, True): for nn_type in ('mlp', 'conv'): - inject_test_method(use_cuda, parallel, nn_type) + inject_test_method(use_cuda, parallel, nn_type, True) + + # One unit-test for saving parameters as separate files + inject_test_method(False, False, 'mlp', False) inject_all_tests() diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index 9c7ab7d6318472ac9378dd1966b75d19b5505bf5..612d51e08e4fc05b397df9d8aaaf675ba9d783af 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -16,7 +16,7 @@ import math import sys import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core +import paddle.v2.fluid as fluid import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets @@ -104,7 +104,8 @@ def get_mov_combined_features(): CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data(name='category_id', shape=[1], dtype='int64') + category_id = layers.data( + name='category_id', shape=[1], dtype='int64', lod_level=1) mov_categories_emb = layers.embedding( input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -114,7 +115,8 @@ def get_mov_combined_features(): MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') + mov_title_id = layers.data( + name='movie_title', shape=[1], dtype='int64', lod_level=1) mov_title_emb = layers.embedding( input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) @@ -144,23 +146,22 @@ def model(): scale_infer = layers.scale(x=inference, scale=5.0) label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(x=square_cost) - return 
avg_cost + return scale_infer, avg_cost + +def train(use_cuda, save_dirname): + scale_infer, avg_cost = model() + + # test program + test_program = fluid.default_main_program().clone() -def main(): - cost = model() sgd_optimizer = SGDOptimizer(learning_rate=0.2) - opts = sgd_optimizer.minimize(cost) + opts = sgd_optimizer.minimize(avg_cost) - if USE_GPU: - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) @@ -169,6 +170,8 @@ def main(): paddle.reader.shuffle( paddle.dataset.movielens.train(), buf_size=8192), batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) feeding = { 'user_id': 0, @@ -184,7 +187,7 @@ def main(): def func_feed(feeding, data): feed_tensors = {} for (key, idx) in feeding.iteritems(): - tensor = core.LoDTensor() + tensor = fluid.LoDTensor() if key != "category_id" and key != "movie_title": if key == "score": numpy_data = np.array(map(lambda x: x[idx], data)).astype( @@ -211,16 +214,117 @@ def main(): PASS_NUM = 100 for pass_id in range(PASS_NUM): - for data in train_reader(): - outs = exe.run(framework.default_main_program(), + for batch_id, data in enumerate(train_reader()): + # train a mini-batch + outs = exe.run(program=fluid.default_main_program(), feed=func_feed(feeding, data), - fetch_list=[cost]) + fetch_list=[avg_cost]) out = np.array(outs[0]) - if out[0] < 6.0: - # if avg cost less than 6.0, we think our code is good. - exit(0) + if (batch_id + 1) % 10 == 0: + avg_cost_set = [] + for test_data in test_reader(): + avg_cost_np = exe.run(program=test_program, + feed=func_feed(feeding, test_data), + fetch_list=[avg_cost]) + avg_cost_set.append(avg_cost_np[0]) + break # test only 1 segment for speeding up CI + + # get test avg_cost + test_avg_cost = np.array(avg_cost_set).mean() + if test_avg_cost < 6.0: + # if avg_cost less than 6.0, we think our code is good. + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + "user_id", "gender_id", "age_id", "job_id", + "movie_id", "category_id", "movie_title" + ], [scale_infer], exe) + return + if math.isnan(float(out[0])): sys.exit("got NaN loss, training failed.") -main() +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + def create_lod_tensor(data, lod=None): + tensor = fluid.LoDTensor() + if lod is None: + # Tensor, the shape is [batch_size, 1] + index = 0 + lod_0 = [index] + for l in range(len(data)): + index += 1 + lod_0.append(index) + lod = [lod_0] + tensor.set_lod(lod) + + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + tensor.set(flattened_data, place) + return tensor + + # Use the first data from paddle.dataset.movielens.test() as input + assert feed_target_names[0] == "user_id" + user_id = create_lod_tensor([[1]]) + + assert feed_target_names[1] == "gender_id" + gender_id = create_lod_tensor([[1]]) + + assert feed_target_names[2] == "age_id" + age_id = create_lod_tensor([[0]]) + + assert feed_target_names[3] == "job_id" + job_id = create_lod_tensor([[10]]) + + assert feed_target_names[4] == "movie_id" + movie_id = create_lod_tensor([[783]]) + + assert feed_target_names[5] == "category_id" + category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) + + assert feed_target_names[6] == "movie_title" + movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], + [[0, 5]]) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={ + feed_target_names[0]: user_id, + feed_target_names[1]: gender_id, + feed_target_names[2]: age_id, + feed_target_names[3]: job_id, + feed_target_names[4]: movie_id, + feed_target_names[5]: category_id, + feed_target_names[6]: movie_title + }, + fetch_list=fetch_targets, + return_numpy=False) + print("inferred score: ", np.array(results[0])) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the inference model + save_dirname = "recommender_system.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + +if __name__ == '__main__': + main(USE_GPU) diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py index fdc60861760163d2ebad3b050e551929321baafd..7fe43c680ca9319682c42836986308856185a464 100644 --- a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py @@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers +import contextlib +import math +import sys +import unittest from paddle.v2.fluid.executor import Executor dict_size = 30000 @@ -145,7 +149,7 @@ def seq_to_seq_net(): cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) - return avg_cost + return avg_cost, prediction def to_lodtensor(data, place): @@ -163,8 +167,16 @@ def to_lodtensor(data, place): return res -def main(): - avg_cost = seq_to_seq_net() +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def train(use_cuda, save_dirname=None): + [avg_cost, prediction] = seq_to_seq_net() optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) @@ -174,7 +186,7 @@ def 
main(): paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) - place = core.CPUPlace() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) @@ -185,6 +197,7 @@ def main(): word_data = to_lodtensor(map(lambda x: x[0], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place) trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), feed={ 'source_sequence': word_data, @@ -192,13 +205,86 @@ def main(): 'label_sequence': trg_word_next }, fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + " avg_cost=" + str(avg_cost_val)) + if math.isnan(float(avg_cost_val[0])): + sys.exit("got NaN loss, training failed.") if batch_id > 3: - exit(0) + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ['source_sequence', + 'target_sequence'], [prediction], exe) + return + batch_id += 1 +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_data = create_random_lodtensor(lod, place, low=0, high=1) + trg_word = create_random_lodtensor(lod, place, low=0, high=1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ assert feed_target_names[0] == 'source_sequence' + assert feed_target_names[1] == 'target_sequence' + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word_data, + feed_target_names[1]: trg_word, + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "rnn_encoder_decoder.inference.model" + + train(use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + +class TestRnnEncoderDecoder(unittest.TestCase): + def test_cuda(self): + with self.scope_prog_guard(): + main(use_cuda=True) + + def test_cpu(self): + with self.scope_prog_guard(): + main(use_cuda=False) + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + if __name__ == '__main__': - main() + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py index 9c5cb667aed7456b54d32dcd650852cfdbd6cce1..6e0206d41db6265e926991fe35cd53513bd3417d 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py @@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid import paddle.v2 as paddle import contextlib import math +import numpy as np import sys @@ -43,7 +44,7 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer.minimize(avg_cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy + return avg_cost, accuracy, prediction def stacked_lstm_net(data, @@ -81,13 +82,18 @@ def stacked_lstm_net(data, adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer.minimize(avg_cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy + return avg_cost, accuracy, prediction -def main(word_dict, net_method, use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + +def train(word_dict, net_method, use_cuda, save_dirname=None): BATCH_SIZE = 128 PASS_NUM = 5 dict_dim = len(word_dict) @@ -96,7 +102,7 @@ def main(word_dict, net_method, use_cuda): data = fluid.layers.data( name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, acc_out = net_method( + cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( @@ -116,6 +122,9 @@ def main(word_dict, net_method, use_cuda): fetch_list=[cost, acc_out]) print("cost=" + str(cost_val) + " acc=" + str(acc_val)) if cost_val < 0.4 and acc_val > 0.8: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ["words"], + prediction, exe) return if math.isnan(float(cost_val)): sys.exit("got NaN loss, training failed.") @@ -123,6 +132,49 @@ def main(word_dict, net_method, use_cuda): 
net_method.__name__)) +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_dict = paddle.dataset.imdb.word_dict() + tensor_words = create_random_lodtensor( + lod, place, low=0, high=len(word_dict) - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == "words" + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_words}, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def main(word_dict, net_method, use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "understand_sentiment.inference.model" + + train(word_dict, net_method, use_cuda, save_dirname) + infer(use_cuda, save_dirname) + + class TestUnderstandSentiment(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index f013d7f1551bdbfb2f725809e2fb4d7d686560fe..69bfbcee69a08f57e4754f1a94f85534be4baac6 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,6 +1,5 @@ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -16,14 +15,67 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import unittest import os +import numpy as np import math import sys -def main(use_cuda, is_sparse, parallel): - if use_cuda and not fluid.core.is_compiled_with_cuda(): +def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) - 1 + + # Setup input, by creating 4 words, and setting up lod required for + # lookup_table_op + lod = [0, 1] + first_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + second_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + third_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size) + + assert feed_target_names[0] == 'firstw' + assert feed_target_names[1] == 'secondw' + assert feed_target_names[2] == 'thirdw' + assert feed_target_names[3] == 'forthw' + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={ + feed_target_names[0]: first_word, + feed_target_names[1]: second_word, + feed_target_names[2]: third_word, + feed_target_names[3]: fourth_word + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def train(use_cuda, is_sparse, parallel, save_dirname): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -67,7 +119,7 @@ def main(use_cuda, is_sparse, parallel): act='softmax') cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) avg_cost = fluid.layers.mean(x=cost) - return avg_cost + return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) @@ -79,13 +131,13 @@ def main(use_cuda, is_sparse, parallel): next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') if not parallel: - avg_cost = __network__( + avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: places = fluid.layers.get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): - avg_cost = __network__( + avg_cost, predict_word = __network__( map(pd.read_input, [ first_word, second_word, third_word, forth_word, next_word ])) @@ -113,6 +165,10 @@ def main(use_cuda, is_sparse, parallel): feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'firstw', 'secondw', 'thirdw', 'forthw' + ], [predict_word], exe) return if math.isnan(float(avg_cost_np[0])): sys.exit("got NaN loss, training failed.") @@ -120,6 +176,14 @@ def main(use_cuda, is_sparse, parallel): raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) +def main(use_cuda, is_sparse, parallel): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_dirname = "word2vec.inference.model" + train(use_cuda, is_sparse, parallel, save_dirname) + infer(use_cuda, save_dirname) + + FULL_TEST = os.getenv('FULL_TEST', '0').lower() in ['true', '1', 't', 'y', 'yes', 'on'] SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster" @@ -142,7 +206,8 @@ def inject_test_method(use_cuda, is_sparse, parallel): with fluid.program_guard(prog, startup_prog): main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel) - if use_cuda and is_sparse and parallel: + # run only 2 cases: use_cuda is either True or False + if is_sparse == False and parallel == False: fn = __impl__ else: # skip the other test when on CI 
server diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 7ad5e2c594f24999e298533b6c05ba688a935f0b..045db8390cd52689a2a803c3387c90776a44ee73 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -15,6 +15,8 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid +import math +import sys # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization @@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM): if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. + if math.isnan(float(avg_loss_value)): + sys.exit("got NaN loss, training failed.") exit(1) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 26673afd83c48328c3f354e82bfa3725aa4805b5..9fbb36d3638bd537020247d6f762afd4ed5d402f 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -18,6 +18,8 @@ import sys import paddle.v2 as paddle import paddle.v2.fluid as fluid +import math +import sys # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization @@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM): print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc)) # this model is slow, so if we can train two mini batch, we think it works properly. + if i > 2: exit(0) + if math.isnan(float(loss)): + sys.exit("got NaN loss, training failed.") i += 1 exit(1) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index ffd53e7a78142162317a677de49c1821635a65b5..48abaa8d87563b7132c5d8962bc33283a104e67a 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor +import math +import sys dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -137,6 +139,8 @@ def main(): " avg_cost=" + str(avg_cost_val)) if batch_id > 2: exit(0) + if math.isnan(float(avg_cost_val)): + sys.exit("got NaN loss, training failed.") batch_id += 1 diff --git a/python/paddle/v2/fluid/tests/notest_csp.py b/python/paddle/v2/fluid/tests/notest_csp.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe234a20b5222eb85e6bcea2fcb05c53ddd57e9 --- /dev/null +++ b/python/paddle/v2/fluid/tests/notest_csp.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.v2.fluid as fluid + + +class TestCSPFramework(unittest.TestCase): + def daisy_chain(self): + n = 10000 + leftmost = fluid.make_channel(dtype=int) + right = leftmost + left = leftmost + with fluid.While(steps=n): + right = fluid.make_channel(dtype=int) + with fluid.go(): + fluid.send(left, 1 + fluid.recv(right)) + left = right + + with fluid.go(): + fluid.send(right, 1) + fluid.Print(fluid.recv(leftmost)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 3f6d7070c2987d0557c60db84a2c679cd2cfe36b..f8475813c0cb7b8c8da8c9c7e0a4626615e609ac 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -326,7 +326,8 @@ class OpTest(unittest.TestCase): self.assertTrue( np.allclose( actual_t, expect_t, atol=atol), - "Output (" + out_name + ") has diff at " + str(place)) + "Output (" + out_name + ") has diff at " + str(place) + + str(actual_t) + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py index e71c3a290c9b120749a5190a246c5d76b7bf1955..970f57ed0008b0d7d99ad8b5de1cb7895239ed2c 100644 --- a/python/paddle/v2/fluid/tests/test_cpp_reader.py +++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py @@ -32,31 +32,43 @@ create_random_data_generator_op = block.append_op( "min": 0.0, "max": 1.0 }) +shuffle_reader = block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader") +shuffle_reader.desc.set_lod_levels([0, 0]) -out1 = block.create_var( - type=fluid.core.VarDesc.VarType.LOD_TENSOR, - name="Out1", - shape=[10, 2], - dtype="float32", - lod_level=1) -out2 = block.create_var( - type=fluid.core.VarDesc.VarType.LOD_TENSOR, - name="Out2", - shape=[10, 1], - dtype="float32", - lod_level=1) +create_shuffle_reader_op = block.append_op( + type="create_shuffle_reader", + inputs={"UnderlyingReader": random_reader}, + outputs={"Out": shuffle_reader}, + attrs={"buffer_size": 7}) + +batch_reader = block.create_var( + type=fluid.core.VarDesc.VarType.READER, name="BatchReader") +batch_reader.desc.set_lod_levels([1, 1]) + +create_batch_reader_op = block.append_op( + type="create_batch_reader", + inputs={"UnderlyingReader": shuffle_reader}, + outputs={"Out": batch_reader}, + attrs={"batch_size": 10}) + +out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1") +out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2") read_op = block.append_op( - type="read", - inputs={"Reader": random_reader}, + type="read", inputs={"Reader": batch_reader}, outputs={"Out": [out1, out2]}) place = fluid.CPUPlace() exe = fluid.Executor(place) -[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) +[res1, res2] = exe.run(prog, fetch_list=[out1, out2], return_numpy=False) + +test_pass = res1.lod() == [range(0, 11)] and res1.lod() == [ + range(0, 11) +] and np.array(res1).shape == (10, 2) and np.array(res2).shape == (10, 1) 
-if len(res1) == 0 or len(res2) == 0: +if not test_pass: exit(1) exit(0) diff --git a/python/paddle/v2/fluid/tests/test_cumsum_op.py b/python/paddle/v2/fluid/tests/test_cumsum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e45ef457306bbdd33508e560cb8d339e801bb5e4 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_cumsum_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSumOp1(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 2} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp2(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': -1, 'reverse': True} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = { + 'Out': np.flip( + np.flip( + self.inputs['X'], axis=2).cumsum(axis=2), axis=2) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp3(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 1} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp4(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 0} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp5(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.inputs = {'X': np.random.random((5, 6)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp7(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.inputs = {'X': np.random.random((6)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSumOp8(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((5, 6, 3)).astype("float64") + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (5, 6, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + def test_check_output(self): + self.check_output() + + def 
test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 68cf8673cd46677065588f652482cd0df08b3450..4460ffaf9c46966178497419a35ef4044464ac9f 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator from paddle.v2.fluid.framework import grad_var_name +np.random.random(123) + def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): x_shape = x.shape @@ -62,9 +64,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): grad_x = dx_end + d_mean + d_std - grad_y.shape = x_shape - x.shape = x_shape + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape scale.shape = scale_shape + var.shape, mean.shape = [N, ], [N, ] return grad_x, d_scale, d_bias @@ -112,10 +114,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None): class TestLayerNormdOp(OpTest): def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue( - np.allclose( - np.array(tensor).reshape(np_array.shape), np_array, atol=atol), - msg) + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) def __assert_grad_close(self, tensor, @@ -123,7 +122,7 @@ class TestLayerNormdOp(OpTest): name, place, max_relative_error=0.02): - a = np.array(tensor).reshape(np_array.shape) + a = np.array(tensor) b = np_array abs_a = np.abs(a) abs_a[abs_a < 1e-5] = 1 @@ -151,7 +150,7 @@ class TestLayerNormdOp(OpTest): x_shape = shape D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) scale_shape = [D] - np.random.random(123) + x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 480ee7091579ba171ca957cb4d25f0034e0534c0..dc6b84dcdc04dd185d97c3cc4b9f00305a911efb 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -42,9 +42,9 @@ class TestOptimizer(unittest.TestCase): type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) opts, _ = sgd_optimizer.minimize(mean_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "sgd") + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd"]) def test_sgd_optimizer_with_global_step(self): init_program = framework.Program() @@ -72,11 +72,10 @@ class TestOptimizer(unittest.TestCase): sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) opts, _ = sgd_optimizer.minimize(mean_out, init_program) - self.assertEqual(len(opts), 2) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "sgd") - increment_op = opts[1] - self.assertEqual(increment_op.type, "increment") + self.assertEqual(len(opts), 4) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd", "increment"]) # Check init_program init_ops = init_program.global_block().ops @@ -121,9 +120,10 @@ class TestMomentumOptimizer(unittest.TestCase): 
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "momentum") + self.assertEqual(len(opts), 3) + sgd_op = opts[-1] + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "momentum"]) self.assertFalse(sgd_op.attr('use_nesterov')) # Check accumulators @@ -170,9 +170,10 @@ class TestMomentumOptimizer(unittest.TestCase): self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - sgd_op = opts[0] - self.assertEqual(sgd_op.type, "momentum") + self.assertEqual(len(opts), 3) + sgd_op = opts[-1] + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "momentum"]) self.assertTrue(sgd_op.attr('use_nesterov')) # Check accumulators @@ -228,9 +229,9 @@ class TestAdagradOptimizer(unittest.TestCase): self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - adagrad_op = opts[0] - self.assertEqual(adagrad_op.type, "adagrad") + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "adagrad"]) # Check accumulators accumulators = adagrad_optimizer.get_accumulators() @@ -288,9 +289,10 @@ class TestAdamOptimizer(unittest.TestCase): self.assertEqual(len(adam_optimizer.get_accumulators()), 0) opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 3) - adam_op = opts[0] - self.assertEqual(adam_op.type, "adam") + self.assertEqual(len(opts), 5) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "adam", "scale", "scale"]) # Check accumulators accumulators = adam_optimizer.get_accumulators() @@ -350,9 +352,10 @@ class TestAdamaxOptimizer(unittest.TestCase): self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, init_program) - self.assertEqual(len(opts), 2) - adam_op = opts[0] - self.assertEqual(adam_op.type, "adamax") + self.assertEqual(len(opts), 4) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "adamax", "scale"]) # Check accumulators accumulators = adamax_optimizer.get_accumulators() @@ -409,9 +412,10 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) opts = decayed_adagrad_optimizer.create_optimization_pass( params_grads, mul_out, init_program) - self.assertEqual(len(opts), 1) - decayed_adagrad_op = opts[0] - self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad") + self.assertEqual(len(opts), 3) + self.assertEqual( + [op.type for op in opts], + ["fill_constant", "elementwise_mul", "decayed_adagrad"]) # Check accumulators accumulators = decayed_adagrad_optimizer.get_accumulators()
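A note on the save-then-infer pattern shared by the book tests in this patch (test_recommender_system, test_rnn_encoder_decoder, test_understand_sentiment, test_word2vec): each train() persists the pruned inference graph with fluid.io.save_inference_model, and each infer() reloads it with fluid.io.load_inference_model and runs it through an Executor. A minimal sketch of that round trip follows; the toy fc network, the feed name 'x' and the directory "toy.inference.model" are illustrative placeholders, and only the two fluid.io calls and the Executor usage mirror the diff.

    import numpy as np
    import paddle.v2.fluid as fluid

    # Hypothetical one-layer network; the real tests save recommender,
    # seq2seq and sentiment models only after training converges.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    save_dirname = "toy.inference.model"  # placeholder directory

    # Persist the feed variable names and the fetch variables.
    fluid.io.save_inference_model(save_dirname, ['x'], [y_predict], exe)

    # Reload: returns the pruned program, the feed target names and the
    # fetch targets to hand back to Executor.run.
    [inference_program, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)

    assert feed_target_names[0] == 'x'
    tensor_x = fluid.LoDTensor()
    tensor_x.set(np.random.rand(1, 13).astype("float32"), place)
    results = exe.run(inference_program,
                      feed={feed_target_names[0]: tensor_x},
                      fetch_list=fetch_targets,
                      return_numpy=False)
    print("inferred value: ", np.array(results[0]))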
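The create_random_lodtensor helper added to three of the tests builds a level-1 LoDTensor: lod is a list of row offsets, so lod = [0, 4, 10] describes two sequences (rows 0-3 and rows 4-9) inside a [10, 1] int64 tensor. Below is the helper as it appears in the diff, with explanatory comments and a small usage sketch; the low/high bounds are illustrative.

    import numpy as np
    import paddle.v2.fluid as fluid


    def create_random_lodtensor(lod, place, low, high):
        # lod[-1] is the total number of rows across all sequences.
        data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
        res = fluid.LoDTensor()
        res.set(data, place)   # copy the numpy buffer onto `place`
        res.set_lod([lod])     # a single LoD level holding the offsets
        return res


    place = fluid.CPUPlace()
    lod = [0, 4, 10]              # two sequences, of lengths 4 and 6
    words = create_random_lodtensor(lod, place, low=0, high=99)
    print(words.lod())            # [[0, 4, 10]]
    print(np.array(words).shape)  # (10, 1)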
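The expected outputs in the new test_cumsum_op.py spell out the reverse and exclusive variants of cumsum with numpy: reverse flips before and after accumulating (TestSumOp2), and exclusive drops the last slice along the axis and prepends zeros (TestSumOp8). The function below only restates those expressions in one place as a reading aid for the fixtures; it is not part of the operator, and it does not combine reverse with exclusive because the tests do not either.

    import numpy as np


    def reference_cumsum(x, axis=-1, reverse=False, exclusive=False):
        """Numpy reference matching the expected values in test_cumsum_op.py."""
        if exclusive:
            # Drop the last slice along `axis`, prepend zeros, then accumulate.
            zeros_shape = list(x.shape)
            zeros_shape[axis] = 1
            shifted = np.concatenate(
                (np.zeros(zeros_shape, dtype=x.dtype),
                 np.take(x, range(x.shape[axis] - 1), axis=axis)),
                axis=axis)
            return shifted.cumsum(axis=axis)
        if reverse:
            # Flip, accumulate, flip back.
            return np.flip(np.flip(x, axis=axis).cumsum(axis=axis), axis=axis)
        return x.cumsum(axis=axis)


    a = np.random.random((5, 6, 3)).astype("float64")
    print(np.allclose(reference_cumsum(a, axis=2, reverse=True),
                      np.flip(np.flip(a, axis=2).cumsum(axis=2), axis=2)))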
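The new notest_csp.py sketches Go's daisy-chain example with the planned fluid.make_channel / fluid.go / fluid.send / fluid.recv primitives: n channels are chained together, a 1 is sent into the far end, and every link forwards the value incremented by one. The same control flow in plain Python, with queue.Queue standing in for a channel and a thread per link, may make the intent clearer; this is an analogue of the pattern, not the fluid API, and it uses a far smaller n than the test's 10000 because it really spawns one thread per link.

    import threading
    try:
        import Queue as queue   # Python 2, matching the paddle.v2 era
    except ImportError:
        import queue            # Python 3


    def daisy_chain(n=100):
        leftmost = queue.Queue(maxsize=1)
        left = right = leftmost
        for _ in range(n):
            right = queue.Queue(maxsize=1)
            # Each link waits on its right-hand channel and forwards value + 1.
            threading.Thread(target=lambda l, r: l.put(1 + r.get()),
                             args=(left, right)).start()
            left = right
        right.put(1)            # seed the far end of the chain
        return leftmost.get()   # the seed incremented once per link


    print(daisy_chain())        # 101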