提交 31f598fc 编写于 作者: Y Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into improve_concat_split_op

paddle/operators/check_t.save
paddle/operators/check_tensor.ls
paddle/operators/tensor.save
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store *.DS_Store
build/ build/
build_doc/ build_doc/
......
...@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release") ...@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) # nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
......
...@@ -179,20 +179,24 @@ function(cc_library TARGET_NAME) ...@@ -179,20 +179,24 @@ function(cc_library TARGET_NAME)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (cc_library_SRCS) if(cc_library_SRCS)
if (cc_library_SHARED OR cc_library_shared) # build *.so if(cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else() else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif() endif()
if (cc_library_DEPS) if(cc_library_DEPS)
# Don't need link libwarpctc.so # Don't need link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;") if("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc) list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc) add_dependencies(${TARGET_NAME} warpctc)
endif() endif()
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif() endif()
# cpplint code style # cpplint code style
......
...@@ -323,6 +323,12 @@ batch_norm ...@@ -323,6 +323,12 @@ batch_norm
.. autofunction:: paddle.v2.fluid.layers.batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex: :noindex:
layer_norm
----------
.. autofunction:: paddle.v2.fluid.layers.layer_norm
:noindex:
beam_search_decode beam_search_decode
------------------ ------------------
......
RNN相关模型 RNN模型
=========== ===========
.. toctree:: .. toctree::
......
...@@ -8,5 +8,4 @@ PaddlePaddle 文档 ...@@ -8,5 +8,4 @@ PaddlePaddle 文档
build_and_install/index_cn.rst build_and_install/index_cn.rst
howto/index_cn.rst howto/index_cn.rst
dev/index_cn.rst dev/index_cn.rst
api/index_cn.rst
faq/index_cn.rst faq/index_cn.rst
...@@ -8,4 +8,3 @@ PaddlePaddle Documentation ...@@ -8,4 +8,3 @@ PaddlePaddle Documentation
build_and_install/index_en.rst build_and_install/index_en.rst
howto/index_en.rst howto/index_en.rst
dev/index_en.rst dev/index_en.rst
api/index_en.rst
...@@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ...@@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) { : prog_(prog), desc_(desc) {
need_update_ = true; need_update_ = true;
for (auto &op : other.ops_) { for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op, this)); ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
} }
for (auto &it : other.vars_) { for (auto &it : other.vars_) {
auto *var = new VarDesc(*it.second); auto *var = new VarDesc(*it.second);
vars_[it.first].reset(var); vars_[it.first].reset(var);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -25,6 +25,26 @@ using paddle::framework::CloseChannel; ...@@ -25,6 +25,26 @@ using paddle::framework::CloseChannel;
using paddle::framework::details::Buffered; using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered; using paddle::framework::details::UnBuffered;
void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
TEST(Channel, MakeAndClose) { TEST(Channel, MakeAndClose) {
using paddle::framework::details::Buffered; using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered; using paddle::framework::details::UnBuffered;
...@@ -137,9 +157,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { ...@@ -137,9 +157,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
EXPECT_EQ(ch->Receive(&out), EXPECT_EQ(ch->Receive(&out),
false); // after receiving residual values, return zeros. false); // receiving on closed channel should return false
// Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't
// define zero values like Go does.
} }
delete ch; delete ch;
} }
...@@ -158,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -158,7 +176,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
sum += i; sum += i;
} }
}); });
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
EXPECT_EQ(sum, 45U); EXPECT_EQ(sum, 45U);
CloseChannel(ch); CloseChannel(ch);
...@@ -166,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -166,31 +184,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
delete ch; delete ch;
} }
TEST(Channel, SimpleUnbufferedChannelTest) { TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
auto ch = MakeChannel<int>(0); auto ch = MakeChannel<int>(0);
unsigned sum_send = 0; RecevingOrderEqualToSendingOrder(ch);
std::thread t([&]() { }
for (int i = 0; i < 5; i++) {
EXPECT_EQ(ch->Send(&i), true);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
CloseChannel(ch); TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
t.join(); auto ch = MakeChannel<int>(10);
EXPECT_EQ(sum_send, 10U); RecevingOrderEqualToSendingOrder(ch);
delete ch;
} }
// This tests that closing a buffered channel also unblocks void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5; size_t num_threads = 5;
std::thread t[num_threads]; std::thread t[num_threads];
bool thread_ended[num_threads]; bool thread_ended[num_threads];
...@@ -201,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { ...@@ -201,15 +205,14 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
int data; int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false); EXPECT_EQ(ch->Receive(&data), false);
*p = true; *p = true;
}, },
&thread_ended[i]); &thread_ended[i]);
} }
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
...@@ -218,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { ...@@ -218,7 +221,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
// This should unblock all receivers // This should unblock all receivers
CloseChannel(ch); CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < num_threads; i++) {
...@@ -226,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { ...@@ -226,13 +229,12 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
} }
// This tests that closing a buffered channel also unblocks void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
// any senders waiting for channel to have write space using paddle::framework::details::Buffered;
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { using paddle::framework::details::UnBuffered;
auto ch = MakeChannel<int>(1);
size_t num_threads = 5; size_t num_threads = 5;
std::thread t[num_threads]; std::thread t[num_threads];
bool thread_ended[num_threads]; bool thread_ended[num_threads];
...@@ -252,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { ...@@ -252,34 +254,56 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
} }
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked if (dynamic_cast<Buffered<int> *>(ch)) {
int ct = 0; // If ch is Buffered, atleast 4 threads must be blocked.
for (size_t i = 0; i < num_threads; i++) { int ct = 0;
if (thread_ended[i] == false) ct++; for (size_t i = 0; i < num_threads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
} }
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread // Explicitly close the thread
// This should unblock all senders // This should unblock all senders
CloseChannel(ch); CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
// Verify that only 1 send was successful if (dynamic_cast<Buffered<int> *>(ch)) {
ct = 0; // Verify that only 1 send was successful
for (size_t i = 0; i < num_threads; i++) { int ct = 0;
if (send_success[i]) ct++; for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} }
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksSendersTest(ch);
delete ch; delete ch;
} }
...@@ -287,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { ...@@ -287,40 +311,7 @@ TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
// unblocks any receivers waiting for senders // unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0); auto ch = MakeChannel<int>(0);
size_t num_threads = 5; ChannelCloseUnblocksReceiversTest(ch);
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch; delete ch;
} }
...@@ -328,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { ...@@ -328,40 +319,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
// unblocks any senders waiting for senders // unblocks any senders waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0); auto ch = MakeChannel<int>(0);
size_t num_threads = 5; ChannelCloseUnblocksReceiversTest(ch);
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked becausew of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
EXPECT_EQ(ch->Send(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the thread
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch; delete ch;
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -25,6 +25,14 @@ namespace paddle { ...@@ -25,6 +25,14 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T> template <typename T>
class Buffered : public paddle::framework::Channel<T> { class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t); friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
...@@ -23,6 +23,13 @@ namespace paddle { ...@@ -23,6 +23,13 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T> template <typename T>
class UnBuffered : public paddle::framework::Channel<T> { class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t); friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
......
...@@ -122,6 +122,11 @@ class GradOpDescMakerBase { ...@@ -122,6 +122,11 @@ class GradOpDescMakerBase {
return it->second; return it->second;
} }
template <typename T>
inline const T& Attr(const std::string& name) const {
return boost::get<T>(GetAttr(name));
}
std::string ForwardOpType() const { return this->fwd_op_.Type(); } std::string ForwardOpType() const { return this->fwd_op_.Type(); }
private: private:
......
...@@ -46,29 +46,7 @@ namespace framework { ...@@ -46,29 +46,7 @@ namespace framework {
* 0 2 4 7 * 0 2 4 7
* 0 2 5 7 10 12 15 20 * 0 2 5 7 10 12 15 20
*/ */
struct LoD : public std::vector<Vector<size_t>> { using LoD = std::vector<Vector<size_t>>;
using std::vector<Vector<size_t>>::vector;
platform::Place place() const {
if (this->size() == 0) {
// Not Initialze Yet.
return platform::CPUPlace();
} else {
return this->front().place();
}
}
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
void CopyToPeer(platform::Place place) {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyToPeer(place);
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t); std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/platform/assert.h" #include "paddle/platform/assert.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <paddle/platform/place.h>
__global__ void test(size_t* a, int size) { __global__ void test(size_t* a, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
...@@ -36,10 +37,9 @@ TEST(LoD, data) { ...@@ -36,10 +37,9 @@ TEST(LoD, data) {
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11})); lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0]; auto& v = lod[0];
test<<<1, 1>>>(v.cuda_data(), v.size()); paddle::platform::CUDAPlace gpu(0);
test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
v.CopyFromCUDA();
for (size_t i = 0; i < v.size(); ++i) { for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2); EXPECT_EQ(v[i], i * 2);
} }
...@@ -63,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) { ...@@ -63,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod(); auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size()); test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) { for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
...@@ -17,176 +17,347 @@ ...@@ -17,176 +17,347 @@
#include <initializer_list> #include <initializer_list>
#include <vector> #include <vector>
#include "paddle/memory/memcpy.h" #include "paddle/framework/tensor.h"
#include "paddle/memory/memory.h" #include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h" #include "glog/logging.h"
#include "paddle/platform/place.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/** // Vector<T> implements the std::vector interface, and can get Data or
* @brief Vector support both cpu and gpu. // MutableData from any place. The data will be synced implicitly inside.
* host vector lifetime is same with Vector
* device vector is lazily malloc and modified.
*/
template <typename T> template <typename T>
class Vector : public std::vector<T> { class Vector {
public: public:
using std::vector<T>::vector; using value_type = T;
// Default ctor. Create empty Vector
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) {
if (count == 0) {
InitEmpty();
} else {
resize(count);
T* ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
}
// Ctor with init_list
Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
}
}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U>& dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
}
// Copy ctor
Vector(const Vector<T>& other) { this->operator=(other); }
// Copy operator
Vector<T>& operator=(const Vector<T>& other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this;
}
// Move ctor
Vector(Vector<T>&& other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable.
T& operator[](size_t i) {
MutableCPU();
return const_cast<T*>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable.
const T& operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
T* begin() { return &this->operator[](0); }
T* end() { return &this->operator[](size()); }
T& front() { return *begin(); }
T& back() {
auto it = end();
--it;
return *it;
}
const T* begin() const { return &this->operator[](0); }
const T* end() const { return &this->operator[](size()); }
const T& back() const {
auto it = end();
--it;
return *it;
}
T* data() { return begin(); }
const T* data() const { return begin(); }
const T& front() const { return *begin(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
Vector() {} // extend a vector by iterator.
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT // NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T* ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
}
inline platform::Place place() const { return place_; } // resize the vector
void resize(size_t size) {
if (size + 1 < capacity()) {
size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T* ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T* old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
}
/*! Return a pointer to constant memory block. */ // get cuda ptr. immutable
inline const T *data(platform::Place place) const; const T* CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return cuda_vec_.data<T>();
}
/*! Return a pointer to mutable memory block. */ // get cuda ptr. mutable
inline T *mutable_data(platform::Place place); T* CUDAMutableData(platform::Place place) {
const T* ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T*>(ptr);
}
// TODO(dzhwinter): below interfaces should be removed // clear
/* Get device vector */ void clear() {
T *cuda_data() { size_ = 0;
CopyToCUDA(); flag_ = kDirty | kDataInCPU;
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_.get());
} }
/* Get host vector */ size_t capacity() const {
T *data() { return std::vector<T>::data(); } return cpu_vec_.memory_size() / SizeOfType(typeid(T));
const T *data() const { return std::vector<T>::data(); } }
// reserve data
void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
T *data(const platform::Place &place) { // the unify method to access CPU or CUDA data. immutable.
if (platform::is_cpu_place(place)) { const T* Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data(); return data();
}
}
// the unify method to access CPU or CUDA data. mutable.
T* MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
return CUDAMutableData(place);
} else { } else {
return cuda_data(); return data();
} }
} }
/* Synchronize host vector to device vector */ // implicit cast operator. Vector can be cast to std::vector implicitly.
void CopyToCUDA(); operator std::vector<T>() const {
/* Synchronize device vector to host vector */ std::vector<T> result;
void CopyFromCUDA(); result.resize(size());
/* Switch device vector location */ std::copy(begin(), end(), result.begin());
void CopyToPeer(platform::Place); return result;
}
bool operator==(const Vector<T>& other) const {
if (size() != other.size()) return false;
for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
private: private:
std::shared_ptr<void> cuda_ptr_; void InitEmpty() {
size_t cuda_size_ = 0; // device vector numel size_ = 0;
platform::CUDAPlace place_; flag_ = kDataInCPU;
}; }
template <typename T> template <typename Iter>
inline const T *Vector<T>::data(platform::Place place) const { void InitByIter(size_t size, Iter begin, Iter end) {
if (platform::is_cpu_place(place)) { platform::Place cpu = platform::CPUPlace();
return std::vector<T>::data(); T* ptr = this->cpu_vec_.template mutable_data<T>(
} else if (platform::is_gpu_place(place)) { framework::make_ddim({static_cast<int64_t>(size)}), cpu);
if (cuda_ptr_ == nullptr) { for (size_t i = 0; i < size; ++i) {
return nullptr; *ptr++ = *begin++;
} }
if (boost::get<platform::CUDAPlace>(place) == place_) { flag_ = kDataInCPU | kDirty;
return static_cast<const T *>(cuda_ptr_.get()); size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else { } else {
PADDLE_THROW( if (!IsInCUDA()) {
"Unmatched place. Please use `mutable_data` copy lod to the target " // Even data is not dirty. However, data is not in CUDA. Copy data.
"Place first."); Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
} }
} else {
PADDLE_THROW("Unsupport Place.");
} }
}
template <typename T> void ImmutableCPU() const {
inline T *Vector<T>::mutable_data(platform::Place place) { if (IsDirty() &&
if (platform::is_cpu_place(place)) { !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
return std::vector<T>::data(); CopyToCPU();
} else if (platform::is_gpu_place(place)) { UnsetFlag(kDirty);
if (boost::get<platform::CUDAPlace>(place) != place_) { }
place_ = boost::get<platform::CUDAPlace>(place); SetFlag(kDataInCPU);
} }
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
return static_cast<T *>(cuda_ptr_.get());
#else
return nullptr;
#endif
} else {
PADDLE_THROW("Unsupport Place.");
}
}
template <typename T> void UnsetFlag(int flag) const { flag_ &= ~flag; }
void Vector<T>::CopyToCUDA() { void SetFlag(int flag) const { flag_ |= flag; }
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T> bool IsDirty() const { return flag_ & kDirty; }
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommitted cuda data.";
return;
}
this->resize(cuda_size_);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_.get()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T> bool IsInCUDA() const { return flag_ & kDataInCUDA; }
void Vector<T>::CopyToPeer(platform::Place place) {
#ifdef PADDLE_WITH_CUDA bool IsInCPU() const { return flag_ & kDataInCPU; }
if (boost::get<platform::CUDAPlace>(place) != place_) {
place_ = boost::get<platform::CUDAPlace>(place); static void WaitPlace(const platform::Place place) {
} if (platform::is_gpu_place(place)) {
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) { platform::DeviceContextPool::Instance()
cuda_ptr_.reset( .Get(boost::get<platform::CUDAPlace>(place))
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)), ->Wait();
memory::PlainDeleter<void, platform::CUDAPlace>(place_)); }
} }
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); mutable int flag_;
auto *ctx = pool.GetByPlace(place_); mutable Tensor cpu_vec_;
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(), mutable Tensor cuda_vec_;
static_cast<const void *>(this->data()), size_t size_;
this->size() * sizeof(T), ctx->stream()); };
ctx->Wait();
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -11,62 +11,83 @@ ...@@ -11,62 +11,83 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "gtest/gtest.h"
#include "paddle/framework/init.h" #include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/framework/mixed_vector.h" #include "paddle/framework/mixed_vector.h"
#include "paddle/platform/gpu_info.h"
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
template <typename T> template <typename T>
__global__ void test(T* data, int size) { using vec = paddle::framework::Vector<T>;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) { TEST(mixed_vector, CPU_VECTOR) {
data[i] *= 2; vec<int> tmp;
for (int i = 0; i < 10; ++i) {
tmp.push_back(i);
}
ASSERT_EQ(tmp.size(), 10);
vec<int> tmp2;
tmp2 = tmp;
ASSERT_EQ(tmp2.size(), 10);
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp2[i], i);
ASSERT_EQ(tmp2[i], tmp[i]);
}
int cnt = 0;
for (auto& t : tmp2) {
ASSERT_EQ(t, cnt);
++cnt;
} }
} }
TEST(Vector, Normal) { static __global__ void multiply_10(int* ptr) {
// fill the device context pool. for (int i = 0; i < 10; ++i) {
InitDevices(); ptr[i] *= 10;
}
}
cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
}
Vector<size_t> vec({1, 2, 3}); TEST(mixed_vector, GPU_VECTOR) {
size_t* ptr = vec.data(); vec<int> tmp;
for (size_t i = 0; i < vec.size(); ++i) { for (int i = 0; i < 10; ++i) {
EXPECT_EQ(vec[i], *(ptr + i)); tmp.push_back(i);
} }
ASSERT_EQ(tmp.size(), 10);
paddle::platform::CUDAPlace gpu(0);
vec.clear(); multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3}; for (int i = 0; i < 10; ++i) {
for (size_t i = 0; i < v.size(); ++i) { ASSERT_EQ(tmp[i], i * 10);
EXPECT_EQ(v[i], vec[i]);
} }
} }
TEST(Vector, MultipleCopy) { TEST(mixed_vector, MultiGPU) {
InitDevices(); if (paddle::platform::GetCUDADeviceCount() < 2) {
Vector<size_t> vec({1, 2, 3}); LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
CUDAPlace place(0); "GPUs in your machine.";
vec.mutable_data(place); return;
auto vec2 = Vector<size_t>(vec); }
{
const size_t* ptr = vec2.data(CPUPlace()); vec<int> tmp;
for (size_t i = 0; i < vec2.size(); ++i) { for (int i = 0; i < 10; ++i) {
EXPECT_EQ(*(ptr + i), vec[i]); tmp.push_back(i);
}
} }
test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size()); ASSERT_EQ(tmp.size(), 10);
vec2.CopyFromCUDA(); paddle::platform::CUDAPlace gpu0(0);
{ paddle::platform::SetDeviceId(0);
const size_t* ptr = vec2.data(CPUPlace()); multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
for (size_t i = 0; i < vec2.size(); ++i) { paddle::platform::CUDAPlace gpu1(1);
EXPECT_EQ(*(ptr + i), vec[i] * 2); auto* gpu1_ptr = tmp.MutableData(gpu1);
} paddle::platform::SetDeviceId(1);
multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp[i], i * 100);
} }
} }
...@@ -125,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) ...@@ -125,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
// restore attrs_ // restore attrs_
for (const proto::OpDesc::Attr &attr : desc_.attrs()) { for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
std::string attr_name = attr.name(); std::string attr_name = attr.name();
// The sub_block referred to by the BLOCK attr hasn't been added
// to ProgramDesc class yet, we skip setting BLOCK attr here.
if (attr.type() != proto::AttrType::BLOCK) { if (attr.type() != proto::AttrType::BLOCK) {
attrs_[attr_name] = GetAttrValue(attr); attrs_[attr_name] = GetAttrValue(attr);
} else {
auto bid = attr.block_idx();
attrs_[attr_name] = prog->MutableBlock(bid);
} }
} }
this->block_ = block; this->block_ = block;
......
...@@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar { ...@@ -143,7 +143,7 @@ class OpKernelRegistrar : public Registrar {
/** /**
* Macro to register Operator. When the input is duplicable, you should * Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with deop_empty_grad=false instead. * use REGISTER_OP_EX with drop_empty_grad=false instead.
*/ */
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \ grad_op_class) \
......
...@@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() { ...@@ -43,11 +43,20 @@ ProgramDesc::ProgramDesc() {
ProgramDesc::ProgramDesc(const ProgramDesc &o) { ProgramDesc::ProgramDesc(const ProgramDesc &o) {
desc_ = o.desc_; desc_ = o.desc_;
for (int i = 0; i < desc_.blocks_size(); ++i) { for (int i = 0; i < desc_.blocks_size(); ++i) {
auto *block = desc_.mutable_blocks(i); auto *block = desc_.mutable_blocks(i);
blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
} }
for (auto &block : blocks_) {
for (auto *op : block->AllOps()) {
for (const auto &attr : op->Proto()->attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
}
}
}
}
} }
ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
...@@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { ...@@ -55,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
for (auto &block_desc : *desc_.mutable_blocks()) { for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDesc(this, &block_desc)); blocks_.emplace_back(new BlockDesc(this, &block_desc));
} }
for (auto &block : blocks_) {
for (auto *op : block->AllOps()) {
for (const auto &attr : op->Proto()->attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
}
}
}
}
} }
ProgramDesc::ProgramDesc(const std::string &binary_str) { ProgramDesc::ProgramDesc(const std::string &binary_str) {
......
...@@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) { ...@@ -49,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) {
return false; return false;
} }
void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, int GetSubBlockIndex(const proto::OpDesc& op_desc) {
int block_id) { for (auto& attr : op_desc.attrs()) {
// TODO(tonyyang-svail): if (attr.type() == proto::AttrType::BLOCK) {
// - will change to use multiple blocks for RNN op and Cond Op PADDLE_ENFORCE(attr.has_block_idx());
return attr.block_idx();
}
}
return -1;
}
bool HasSubBlock(const proto::OpDesc& op_desc) {
return GetSubBlockIndex(op_desc) > 0;
}
// block_id is the idx of the current block in the input desc
// parent_block_id is the idx of the parent of the current block
// in the output desc, -1 means the current block is global block
// dependent_vars is passed recursively from the parent block to
// the child block to help pruning
void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
int block_id, int parent_block_id,
std::set<std::string>& dependent_vars) {
auto& block = input.blocks(block_id); auto& block = input.blocks(block_id);
auto& ops = block.ops(); auto& ops = block.ops();
...@@ -72,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -72,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
expect_fetch = (op_desc.type() == kFetchOpType); expect_fetch = (op_desc.type() == kFetchOpType);
} }
std::set<std::string> dependent_vars;
std::vector<bool> should_run; std::vector<bool> should_run;
for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
auto& op_desc = *op_iter; auto& op_desc = *op_iter;
if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) { if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
// insert its input to the dependency graph // insert its input to the dependency graph
for (auto& var : op_desc.inputs()) { for (auto& var : op_desc.inputs()) {
...@@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -84,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
dependent_vars.insert(argu); dependent_vars.insert(argu);
} }
} }
should_run.push_back(true); should_run.push_back(true);
} else { } else {
should_run.push_back(false); should_run.push_back(false);
...@@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -95,45 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
// we reverse the should_run vector // we reverse the should_run vector
std::reverse(should_run.begin(), should_run.end()); std::reverse(should_run.begin(), should_run.end());
*output = input; // copy the current block from input to output
auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); auto* block_field = output->mutable_blocks();
*block_field->Add() = input.blocks(block_id);
int output_block_id = output->blocks_size() - 1;
auto* output_block = output->mutable_blocks(output_block_id);
output_block->set_idx(output_block_id);
output_block->set_parent_idx(parent_block_id);
auto* op_field = output_block->mutable_ops();
op_field->Clear(); op_field->Clear();
for (size_t i = 0; i < should_run.size(); ++i) { for (size_t i = 0; i < should_run.size(); ++i) {
if (should_run[i]) { if (should_run[i]) {
*op_field->Add() = input.blocks(block_id).ops(i); auto* op = op_field->Add();
*op = input.blocks(block_id).ops(i);
if (HasSubBlock(*op)) {
// create sub_block_dependent_vars here to help prune the sub block
std::set<std::string> sub_block_dependent_vars;
for (auto& var : op->inputs()) {
for (auto& argu : var.arguments()) {
sub_block_dependent_vars.insert(argu);
}
}
for (auto& var : op->outputs()) {
for (auto& argu : var.arguments()) {
sub_block_dependent_vars.insert(argu);
}
}
// GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
// output_block_id is the idx of the current block in the output desc
prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
sub_block_dependent_vars);
}
} }
} }
// remove the VarDescs in BlockDesc that are not referenced in // remove the VarDescs in BlockDesc that are not referenced in
// the pruned OpDescs // the pruned OpDescs
std::unordered_map<std::string, proto::VarDesc> var_map; std::unordered_map<std::string, proto::VarDesc> var_map;
auto* var_field = output->mutable_blocks(block_id)->mutable_vars(); auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
for (const auto& var : *var_field) { for (const auto& var : *var_field) {
var_map[var.name()] = var; var_map[var.name()] = var;
} }
var_field->Clear(); std::set<std::string> var_names;
for (const auto& op : *op_field) { for (const auto& op : *op_field) {
// add VarDescs of all input arguments for each OpDesc
auto& input_field = op.inputs(); auto& input_field = op.inputs();
for (auto& input_var : input_field) { for (auto& input_var : input_field) {
for (auto& arg : input_var.arguments()) { for (auto& arg : input_var.arguments()) {
*var_field->Add() = var_map[arg]; if (var_map.count(arg) != 0) {
var_names.insert(arg);
}
} }
} }
// add VarDescs of all output arguments for each OpDesc
auto& output_field = op.outputs(); auto& output_field = op.outputs();
for (auto& output_var : output_field) { for (auto& output_var : output_field) {
for (auto& arg : output_var.arguments()) { for (auto& arg : output_var.arguments()) {
*var_field->Add() = var_map[arg]; if (var_map.count(arg) != 0) {
var_names.insert(arg);
}
} }
} }
} }
var_field->Clear();
for (const auto& name : var_names) {
*var_field->Add() = var_map[name];
}
} }
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
prune_impl(input, output, 0); std::set<std::string> dependent_vars;
output->clear_blocks();
prune_impl(input, output, 0, -1, dependent_vars);
} }
void inference_optimize_impl(const proto::ProgramDesc& input, void inference_optimize_impl(const proto::ProgramDesc& input,
......
...@@ -120,6 +120,7 @@ class Tensor { ...@@ -120,6 +120,7 @@ class Tensor {
return holder_->type(); return holder_->type();
} }
// memory size returns the holding memory size in byte.
size_t memory_size() const; size_t memory_size() const;
inline void check_memory_size() const; inline void check_memory_size() const;
......
...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> { ...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
}; };
static inline size_t SizeOfType(std::type_index type) { static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor; SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
size_t size = functor(type); size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size; return size;
...@@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) { ...@@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) {
inline void Tensor::check_memory_size() const { inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_LE(
holder_->size(), memory_size() + offset_, numel() * SizeOfType(type()), memory_size(),
"Tensor's dims_ is out of bound. Call Tensor::mutable_data " "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n" "first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored."); "or maybe the required data-type mismatches the data already stored.");
} }
inline size_t Tensor::memory_size() const { inline size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : numel() * SizeOfType(type()); return holder_ == nullptr ? 0UL : holder_->size() - offset_;
} }
template <typename T> template <typename T>
......
...@@ -4,19 +4,14 @@ cc_library(paddle_fluid_api ...@@ -4,19 +4,14 @@ cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library # Create static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library # Create shared library
add_library(paddle_fluid_shared SHARED io.cc) cc_library(paddle_fluid_shared SHARED
SRCS io.cc
target_circle_link_libraries(paddle_fluid_shared DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
ARCHIVE_START set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
${GLOB_OP_LIB}
${FLUID_CORE_MODULES}
ARCHIVE_END)
SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(WITH_TESTING) if(WITH_TESTING)
add_subdirectory(tests/book) add_subdirectory(tests/book)
......
...@@ -21,6 +21,17 @@ limitations under the License. */ ...@@ -21,6 +21,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void ReadBinaryFile(const std::string& filename, std::string& contents) {
VLOG(3) << "loading model from " << filename;
std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
inputfs.seekg(0, std::ios::end);
contents.clear();
contents.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
inputfs.read(&contents[0], contents.size());
inputfs.close();
}
bool IsParameter(const framework::VarDesc* var, bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) { const framework::ProgramDesc& main_program) {
if (var->Persistable()) { if (var->Persistable()) {
...@@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var, ...@@ -44,12 +55,15 @@ bool IsParameter(const framework::VarDesc* var,
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const framework::ProgramDesc& main_program) { const std::string& param_filename) {
const framework::BlockDesc& global_block = main_program.Block(0); const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc(); framework::ProgramDesc* load_program = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program->MutableBlock(0); framework::BlockDesc* load_block = load_program->MutableBlock(0);
std::vector<std::string> paramlist;
for (auto* var : global_block.AllVars()) { for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) { if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name(); VLOG(3) << "parameter's name: " << var->Name();
...@@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor, ...@@ -61,15 +75,33 @@ void LoadPersistables(framework::Executor& executor,
new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true); new_var->SetPersistable(true);
// append_op if (!param_filename.empty()) {
framework::OpDesc* op = load_block->AppendOp(); paramlist.push_back(new_var->Name());
op->SetType("load"); } else {
op->SetOutput("Out", {new_var->Name()}); // append_op
op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); framework::OpDesc* op = load_block->AppendOp();
op->CheckAttrs(); op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
} }
} }
if (!param_filename.empty()) {
// sort paramlist to have consistent ordering
std::sort(paramlist.begin(), paramlist.end());
// append just the load_combine op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load_combine");
op->SetOutput("Out", paramlist);
op->SetAttr("file_path", {param_filename});
op->CheckAttrs();
}
executor.Run(*load_program, &scope, 0, true, true); executor.Run(*load_program, &scope, 0, true, true);
VLOG(3) << "Ran loading successfully";
delete load_program; delete load_program;
} }
...@@ -77,20 +109,29 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor, ...@@ -77,20 +109,29 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const std::string& dirname) { const std::string& dirname) {
std::string model_filename = dirname + "/__model__"; std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str; std::string program_desc_str;
inputfs.seekg(0, std::ios::end); ReadBinaryFile(model_filename, program_desc_str);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg); std::unique_ptr<framework::ProgramDesc> main_program(
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); new framework::ProgramDesc(program_desc_str));
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close(); LoadPersistables(executor, scope, *main_program, dirname, "");
return main_program;
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor,
framework::Scope& scope,
const std::string& prog_filename,
const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
LoadPersistables(executor, scope, dirname, *main_program); LoadPersistables(executor, scope, *main_program, "", param_filename);
return main_program; return main_program;
} }
......
...@@ -26,12 +26,18 @@ namespace inference { ...@@ -26,12 +26,18 @@ namespace inference {
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const framework::ProgramDesc& main_program); const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor, std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope, framework::Scope& scope,
const std::string& dirname); const std::string& dirname);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& prog_filename,
const std::string& param_filename);
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -5,25 +5,30 @@ function(inference_test TARGET_NAME) ...@@ -5,25 +5,30 @@ function(inference_test TARGET_NAME)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
set(arg_list "")
if(inference_test_ARGS) if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS}) foreach(arg ${inference_test_ARGS})
cc_test(test_inference_${TARGET_NAME}_${arg} list(APPEND arg_list "_${arg}")
SRCS test_inference_${TARGET_NAME}.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}_${arg}.inference.model)
set_tests_properties(test_inference_${TARGET_NAME}_${arg}
PROPERTIES DEPENDS test_${TARGET_NAME})
endforeach() endforeach()
else() else()
cc_test(test_inference_${TARGET_NAME} list(APPEND arg_list "_")
endif()
foreach(arg ${arg_list})
string(REGEX REPLACE "^_$" "" arg "${arg}")
cc_test(test_inference_${TARGET_NAME}${arg}
SRCS test_inference_${TARGET_NAME}.cc SRCS test_inference_${TARGET_NAME}.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}.inference.model) ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
set_tests_properties(test_inference_${TARGET_NAME} set_tests_properties(test_inference_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME}) PROPERTIES DEPENDS test_${TARGET_NAME})
endif() endforeach()
endfunction(inference_test) endfunction(inference_test)
inference_test(recognize_digits ARGS mlp) inference_test(fit_a_line)
inference_test(image_classification ARGS vgg resnet) inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles) inference_test(label_semantic_roles)
inference_test(recognize_digits ARGS mlp)
inference_test(recommender_system)
inference_test(rnn_encoder_decoder)
inference_test(understand_sentiment)
inference_test(word2vec)
...@@ -30,6 +30,15 @@ void SetupTensor(paddle::framework::LoDTensor& input, ...@@ -30,6 +30,15 @@ void SetupTensor(paddle::framework::LoDTensor& input,
} }
} }
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
std::vector<T>& data) {
CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
}
template <typename T> template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input, void SetupLoDTensor(paddle::framework::LoDTensor& input,
paddle::framework::LoD& lod, paddle::framework::LoD& lod,
...@@ -37,7 +46,18 @@ void SetupLoDTensor(paddle::framework::LoDTensor& input, ...@@ -37,7 +46,18 @@ void SetupLoDTensor(paddle::framework::LoDTensor& input,
T upper) { T upper) {
input.set_lod(lod); input.set_lod(lod);
int dim = lod[0][lod[0].size() - 1]; int dim = lod[0][lod[0].size() - 1];
SetupTensor(input, {dim, 1}, lower, upper); SetupTensor<T>(input, {dim, 1}, lower, upper);
}
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
paddle::framework::LoD lod,
std::vector<T>& data) {
const size_t level = lod.size() - 1;
CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
input.set_lod(lod);
SetupTensor<T>(input, dims, data);
} }
template <typename T> template <typename T>
...@@ -64,20 +84,35 @@ void CheckError(paddle::framework::LoDTensor& output1, ...@@ -64,20 +84,35 @@ void CheckError(paddle::framework::LoDTensor& output1,
count++; count++;
} }
} }
EXPECT_EQ(count, 0) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
template <typename Place, typename T> template <typename Place, bool IsCombined = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) { std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// 1. Define place, executor and scope // 1. Define place, executor, scope
auto place = Place(); auto place = Place();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load all parameters from file // 2. Initialize the inference_program and load parameters
auto inference_program = paddle::inference::Load(executor, *scope, dirname); std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (IsCombined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// Users are free to specify different filename
// (provided: the filenames are changed in the python api as well: io.py)
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
// 3. Get the feed_target_names and fetch_target_names // 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, fit_a_line) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input;
// The second dim of the input tensor should be 13
// The input data should be >= 0
int64_t batch_size = 10;
SetupTensor<float>(
input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
...@@ -29,11 +29,15 @@ TEST(inference, image_classification) { ...@@ -29,11 +29,15 @@ TEST(inference, image_classification) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normilized image pixels as input data, // Use normilized image pixels as input data,
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
SetupTensor<float>( SetupTensor<float>(input,
input, {1, 3, 32, 32}, static_cast<float>(0), static_cast<float>(1)); {batch_size, 3, 32, 32},
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -42,8 +46,7 @@ TEST(inference, image_classification) { ...@@ -42,8 +46,7 @@ TEST(inference, image_classification) {
cpu_fetchs1.push_back(&output1); cpu_fetchs1.push_back(&output1);
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -52,8 +55,7 @@ TEST(inference, image_classification) { ...@@ -52,8 +55,7 @@ TEST(inference, image_classification) {
cpu_fetchs2.push_back(&output2); cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -58,8 +58,7 @@ TEST(inference, label_semantic_roles) { ...@@ -58,8 +58,7 @@ TEST(inference, label_semantic_roles) {
cpu_fetchs1.push_back(&output1); cpu_fetchs1.push_back(&output1);
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod(); LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
...@@ -69,8 +68,7 @@ TEST(inference, label_semantic_roles) { ...@@ -69,8 +68,7 @@ TEST(inference, label_semantic_roles) {
cpu_fetchs2.push_back(&output2); cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod(); LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
......
...@@ -29,6 +29,50 @@ TEST(inference, recognize_digits) { ...@@ -29,6 +29,50 @@ TEST(inference, recognize_digits) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(input,
{batch_size, 1, 28, 28},
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
TEST(inference, recognize_digits_combine) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normilized image pixels as input data, // Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0]. // which should be in the range [-1.0, 1.0].
...@@ -42,7 +86,7 @@ TEST(inference, recognize_digits) { ...@@ -42,7 +86,7 @@ TEST(inference, recognize_digits) {
cpu_fetchs1.push_back(&output1); cpu_fetchs1.push_back(&output1);
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>( TestInference<paddle::platform::CPUPlace, true>(
dirname, cpu_feeds, cpu_fetchs1); dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
...@@ -52,7 +96,7 @@ TEST(inference, recognize_digits) { ...@@ -52,7 +96,7 @@ TEST(inference, recognize_digits) {
cpu_fetchs2.push_back(&output2); cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>( TestInference<paddle::platform::CUDAPlace, true>(
dirname, cpu_feeds, cpu_fetchs2); dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, recommender_system) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
category_id, movie_title;
// Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1};
SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1};
SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0};
SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10};
SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783};
SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9};
SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id);
cpu_feeds.push_back(&gender_id);
cpu_feeds.push_back(&age_id);
cpu_feeds.push_back(&job_id);
cpu_feeds.push_back(&movie_id);
cpu_feeds.push_back(&category_id);
cpu_feeds.push_back(&movie_title);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, rnn_encoder_decoder) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(
word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(
trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data);
cpu_feeds.push_back(&trg_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, understand_sentiment) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor words;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&words);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
TEST(inference, word2vec) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2072; // Hard-coding the size of dictionary
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word);
cpu_feeds.push_back(&second_word);
cpu_feeds.push_back(&third_word);
cpu_feeds.push_back(&fourth_word);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.lod();
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.lod();
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
...@@ -101,9 +101,9 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -101,9 +101,9 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel< SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0, T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr, .stream()>>>(
param_data, moment_data, grad_width, grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr,
epsilon); param_data, moment_data, grad_width, epsilon);
} }
}; };
......
...@@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
int64_t* rows = nullptr; int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data(); rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
} else { } else {
rows = grad_merge.mutable_rows()->data(); rows = grad_merge.mutable_rows()->data();
} }
......
...@@ -58,8 +58,8 @@ class CompareOpInferShape : public framework::InferShapeBase { ...@@ -58,8 +58,8 @@ class CompareOpInferShape : public framework::InferShapeBase {
comment.type); comment.type);
auto dim_x = context->GetInputDim("X"); auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Y"); auto dim_y = context->GetInputDim("Y");
PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y), PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(),
"The number of elements in X and Y should be same"); "The size of dim_y should not be greater than dim_x's.");
context->SetOutputDim("Out", context->GetInputDim("X")); context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out"); context->ShareLoD("X", "Out");
......
...@@ -62,7 +62,7 @@ class CompareOpKernel ...@@ -62,7 +62,7 @@ class CompareOpKernel
z->mutable_data<T>(context.GetPlace()); z->mutable_data<T>(context.GetPlace());
int axis = context.Attr<int>("axis"); int axis = context.Attr<int>("axis");
ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis, ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
z); Functor(), z);
} }
}; };
......
...@@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> { ...@@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>( MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank, num_tokens, tokens, num_seq,
merge_repeated, dev_out_lod0_ptr, output_data); input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
dev_out_lod0_ptr, output_data);
// set output lod // set output lod
std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename Functor>
class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
"Cannot get output tensor Out, variable name = %s",
context.op().Output("Out"));
int axis = context.Attr<int>("axis");
bool exclusive = context.Attr<bool>("exclusive");
bool reverse = context.Attr<bool>("reverse");
auto x_dims = X.dims();
if (axis == -1) {
axis = x_dims.size() - 1;
}
PADDLE_ENFORCE_LT(
axis, x_dims.size(),
"axis should be less than the dimensiotn of the input tensor");
Out.mutable_data<T>(context.GetPlace());
int pre = 1;
int post = 1;
int mid = x_dims[axis];
for (int i = 0; i < axis; ++i) {
pre *= x_dims[i];
}
for (int i = axis + 1; i < x_dims.size(); ++i) {
post *= x_dims[i];
}
auto x = framework::EigenVector<T>::Flatten(X);
auto out = framework::EigenVector<T>::Flatten(Out);
auto* place =
context.template device_context<DeviceContext>().eigen_device();
using IndexT = Eigen::DenseIndex;
if (pre == 1) {
if (post == 1) {
ComputeImp(*place, Eigen::DSizes<IndexT, 1>(mid), x, out,
/* axis= */ 0, reverse, exclusive);
} else {
ComputeImp(*place, Eigen::DSizes<IndexT, 2>(mid, post), x, out,
/* axis= */ 0, reverse, exclusive);
}
} else {
if (post == 1) {
ComputeImp(*place, Eigen::DSizes<IndexT, 2>(pre, mid), x, out,
/* axis= */ 1, reverse, exclusive);
} else {
ComputeImp(*place, Eigen::DSizes<IndexT, 3>(pre, mid, post), x, out,
/* axis= */ 1, reverse, exclusive);
}
}
}
private:
template <typename Device, typename Dim, typename X, typename Out>
void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis,
bool reverse, bool exclusive) const {
if (!reverse) {
out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive);
} else {
std::array<bool, Dim::count> rev;
rev.fill(false);
rev[axis] = reverse;
out.reshape(dims).device(d) =
Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev);
}
}
};
template <typename T>
struct CumsumFunctor {
using ELEMENT_TYPE = T;
template <typename X>
const typename X::TensorScanSumOp operator()(X x, int axis,
bool exclusive) const {
return x.cumsum(axis, exclusive);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/cum_op.h"
namespace paddle {
namespace operators {
class CumOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Cumsum operator");
AddOutput("Out", "Output of Cumsum operator");
AddAttr<int>("axis",
"(int, default -1). The dimenstion to accumulate along. "
"-1 means the last dimenstion")
.SetDefault(-1)
.EqualGreaterThan(-1);
AddAttr<bool>("exclusive",
"bool, default false). Whether to perform exclusive cumsum")
.SetDefault(false);
AddAttr<bool>("reverse",
"bool, default false). If true, the cumsum is performed in "
"the reversed direction")
.SetDefault(false);
AddComment(R"DOC(
The cumulative sum of the elements along a given axis.
By default, the first element of the result is the same of the first element of
the input. If exlusive is true, the first element of the result is 0.
)DOC");
}
};
class CumsumGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("cumsum");
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttr("axis", Attr<int>("axis"));
grad_op->SetAttr("reverse", !Attr<bool>("reverse"));
grad_op->SetAttr("exclusive", Attr<bool>("exclusive"));
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/cum_op.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
...@@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
AddFunctor<T>(), z);
} }
}; };
......
...@@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
DivFunctor<T>(), z);
} }
}; };
......
...@@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MaxFunctor<T>(), z);
} }
}; };
......
...@@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MinFunctor<T>(), z);
} }
}; };
......
...@@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> { ...@@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MulFunctor<T>(), z);
} }
}; };
......
...@@ -365,10 +365,10 @@ template <typename Functor, typename DeviceContext, typename T, ...@@ -365,10 +365,10 @@ template <typename Functor, typename DeviceContext, typename T,
typename OutType = T> typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext& ctx, void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* x,
const framework::Tensor* y, int axis, const framework::Tensor* y, int axis, Functor func,
framework::Tensor* z) { framework::Tensor* z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor( TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), Functor()); x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims = y->dims();
......
...@@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel<T> { ...@@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
PowFunctor<T>(), z);
} }
}; };
......
...@@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> { ...@@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<Tensor>("Out"); auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z); ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
SubFunctor<T>(), z);
} }
}; };
......
...@@ -21,13 +21,6 @@ using Tensor = framework::Tensor; ...@@ -21,13 +21,6 @@ using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout; using DataLayout = framework::DataLayout;
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
class LayerNormOp : public framework::OperatorWithKernel { class LayerNormOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Layer Normalization. Layer Normalization.
Layer Norm has been implemented as discussed in the paper: Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450 https://arxiv.org/abs/1607.06450
... ...
...@@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450 ...@@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450
} }
}; };
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto *output = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
output->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
auto squre = [](T ele) { return ele * ele; };
auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
mean_map = input_map.rowwise().mean();
var_map = (input_map - mean_map.replicate(1, right))
.unaryExpr(squre)
.rowwise()
.mean()
.unaryExpr(add_epslion);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): Some thinking about output_map, is it appropriate that
// `output_map` and `input_map` point to the same memory.
auto inv_std = var_map.unaryExpr(inv_std_func);
if (scale && bias) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1)) +
bias_map.replicate(left, 1);
} else if (scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1));
} else if (bias) {
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right)) +
bias_map.replicate(left, 1);
} else {
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right));
}
}
};
class LayerNormGradOp : public framework::OperatorWithKernel { class LayerNormGradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -193,8 +116,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { ...@@ -193,8 +116,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
// check input // check input
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LayerNormOp should not be null."); "Input(X) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"), PADDLE_ENFORCE(ctx->HasInput("Mean"),
"Input(Mean) of LayerNormOp should not be null."); "Input(Mean) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Variance"), PADDLE_ENFORCE(ctx->HasInput("Variance"),
...@@ -237,125 +158,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { ...@@ -237,125 +158,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
} }
}; };
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *var = ctx.Input<Tensor>("Variance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
d_bias_map = d_y_map.colwise().sum();
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
auto d_scale_map =
EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// There are two equation to compute d_scale. One uses "Y" and the other
// does not use "Y"
d_scale_map =
((x_map - mean_map.replicate(1, right))
.cwiseProduct(
var_map.unaryExpr(inv_std_func).replicate(1, right))
.cwiseProduct(d_y_map))
.colwise()
.sum();
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
auto triple_product_func = [](T ele) { return ele * ele * ele; };
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): these code can be refined
if (d_scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1));
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1))
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
} else {
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map);
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
}
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -363,8 +165,9 @@ namespace ops = paddle::operators; ...@@ -363,8 +165,9 @@ namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp); layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
layer_norm, layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>); ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
layer_norm_grad, layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>); ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/layer_norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -16,19 +16,222 @@ limitations under the License. */ ...@@ -16,19 +16,222 @@ limitations under the License. */
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/elementwise_op_function.h"
#include "paddle/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
};
template <typename T>
struct DivAndSqrtFunctor {
explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
inline HOSTDEVICE T operator()(T a, T b) const {
return a / (sqrt(b + epsilon_));
}
private:
T epsilon_;
};
template <typename T>
struct MulFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct MulInvVarFunctor {
inline HOSTDEVICE T operator()(T a, T b) const {
return a * std::sqrt(1.0 / b);
}
};
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> { class LayerNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims();
y->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
x.Resize(matrix_shape);
Tensor out;
out.ShareDataWith(*y);
out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean;
// get mean
row_mean(dev_ctx, x, mean);
// get variance
ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
row_mean(dev_ctx, out, var);
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &out, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
if (scale) {
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
}
if (bias) {
ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
}
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> { class LayerNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum;
Tensor temp;
Tensor temp_norm;
if (d_scale || d_x) {
x.Resize(matrix_shape);
temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
if (!(bias && scale)) {
temp_norm.ShareDataWith(*y);
temp_norm.Resize(matrix_shape);
} else {
temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
// get x_norm
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
}
}
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
colwise_sum(dev_ctx, d_y, d_bias);
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
colwise_sum(dev_ctx, temp, d_scale);
}
if (d_x) {
framework::DDim vec_shape({left});
d_x->mutable_data<T>(ctx.GetPlace());
auto dx_dim = d_x->dims();
Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean;
if (d_scale) {
// dy_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
} else {
// dy_dx
framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, d_y, &temp_vec);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
// dy_var_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
}
// dy_var_dx
row_mean(dev_ctx, temp, &temp_vec);
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
ctx, d_x, var, /*axis*/ 0,
DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
d_x->Resize(dx_dim);
}
}
}; };
} // namespace operators } // namespace operators
......
...@@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
new_rows.resize(ids_dim[0]); new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place, // TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_dim[0] * sizeof(int64_t), stream); ids_data, ids_dim[0] * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
......
...@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>; ...@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>; template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>; template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
vector->data<double>()); vector->data<double>());
} }
template struct RowwiseSum<platform::CUDADeviceContext, float>;
// template struct RowwiseSum<platform::CUDADeviceContext, double>;
// TODO(zcd): Following ColwiseSum format, need to confirm.
// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
// and only failed for this case. So reimplemented it.
template <>
void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
const platform::CUDADeviceContext& context, const framework::Tensor& input,
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set;
set(context, &one, static_cast<double>(1.0));
gemv<platform::CUDADeviceContext, double>(
context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
1.0, one.data<double>(), input.data<double>(), 0.0,
vector->data<double>());
}
template struct RowwiseMean<platform::CUDADeviceContext, float>;
template struct RowwiseMean<platform::CUDADeviceContext, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -128,6 +128,18 @@ struct ColwiseSum { ...@@ -128,6 +128,18 @@ struct ColwiseSum {
framework::Tensor* vec); framework::Tensor* vec);
}; };
template <typename DeviceContext, typename T>
struct RowwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
template <typename DeviceContext, typename T>
struct RowwiseMean {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> { ...@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
} }
}; };
template <typename DeviceContext, typename T>
void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implement a general reduce. However,
// rowwise-sum can be easily implemented. General reduce has a huge overhead in
// CPU
template <typename T>
class RowwiseMean<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
T sum = 0;
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
sum += in_buf[i * size + j];
}
out_buf[i] = sum * inv_size;
}
}
};
template <typename DeviceContext, typename T>
void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implement a general reduce. However,
// rowwise-sum can be easily implemented. General reduce has a huge overhead in
// CPU
template <typename T>
class RowwiseSum<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
T sum = 0;
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
sum += in_buf[i * size + j];
}
out_buf[i] = sum;
}
}
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -128,7 +128,7 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> { ...@@ -128,7 +128,7 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
// concat rows // concat rows
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
......
...@@ -126,7 +126,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -126,7 +126,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddTensorKernel< SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.cuda_data(), out_data, in1_row_numel); in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output); auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2); auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
...@@ -153,7 +154,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -153,7 +154,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
// concat rows // concat rows
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); if (in1_rows.size()) {
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
}
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
...@@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddToTensorKernel< SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel); in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
in1_row_numel);
} }
}; };
...@@ -283,9 +287,10 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -283,9 +287,10 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
MergeAddKernel< MergeAddKernel<
T, 256><<<grid1, threads, 0, T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input_data, input_rows.cuda_data(), out_data, .stream()>>>(
out.mutable_rows()->cuda_data(), input_data, input_rows.CUDAData(context.GetPlace()), out_data,
out.rows().size(), input_width); out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width);
return out; return out;
} }
}; };
......
...@@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> { ...@@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& src, const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor& dst, framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) { bool is_src_index) {
size_t* index = index_lod.cuda_data();
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2, PADDLE_ENFORCE_EQ(src_dims.size(), 2,
...@@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> { ...@@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
dim3 grid(8, 1); dim3 grid(8, 1);
auto stream = context.stream(); auto stream = context.stream();
CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>( CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
src_data, dst_data, index, height, width, is_src_index); src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height,
width, is_src_index);
} }
}; };
......
...@@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].cuda_data(), sequence_width, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences); max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].cuda_data(), sequence_width, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences); max_sequence_length, num_sequences);
} }
} }
...@@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].cuda_data(), sequence_width, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences); max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].cuda_data(), sequence_width, abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences); max_sequence_length, num_sequences);
} }
} }
......
...@@ -73,7 +73,8 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> { ...@@ -73,7 +73,8 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
dim3 grid(num_seq, 1); dim3 grid(num_seq, 1);
auto stream = context.stream(); auto stream = context.stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>( KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.cuda_data(), out_data, max_index, num_seq, dim); in_data, starts.CUDAData(context.GetPlace()), out_data, max_index,
num_seq, dim);
} }
}; };
......
...@@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<< SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width); seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
scales, seq_width);
} }
}; };
......
...@@ -79,9 +79,6 @@ inline void CopyOrShare(const framework::Variable &src, ...@@ -79,9 +79,6 @@ inline void CopyOrShare(const framework::Variable &src,
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod()); dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else { } else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>()); Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
framework::LoD lod(src.Get<LoDTensor>().lod());
lod.CopyToPeer(dst_place);
dst->GetMutable<LoDTensor>()->set_lod(lod);
} }
} else if (src.IsType<SelectedRows>()) { } else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>(); auto &src_sr = src.Get<SelectedRows>();
...@@ -92,9 +89,6 @@ inline void CopyOrShare(const framework::Variable &src, ...@@ -92,9 +89,6 @@ inline void CopyOrShare(const framework::Variable &src,
dst_sr->set_rows(src_sr.rows()); dst_sr->set_rows(src_sr.rows());
} else { } else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
framework::Vector<int64_t> lod(src_sr.rows());
lod.CopyToPeer(dst_place);
dst_sr->set_rows(lod);
} }
} else { } else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
...@@ -152,9 +146,6 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -152,9 +146,6 @@ class ParallelDoOp : public framework::OperatorBase {
auto *sub_scope = sub_scopes[i]; auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>(); auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::Copy(src, place, dst); framework::Copy(src, place, dst);
framework::LoD lod(src.lod());
lod.CopyToPeer(place);
dst->set_lod(lod);
} }
} }
WaitOnPlaces(places); WaitOnPlaces(places);
......
...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T> ...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.cuda_data(); size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
if (future_context <= 32) { if (future_context <= 32) {
...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T> ...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.cuda_data(); size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
auto &device_ctx = context.cuda_device_context(); auto &device_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> zero; math::SetConstant<platform::CUDADeviceContext, T> zero;
......
...@@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
// Copy LoD to GPU // Copy LoD to GPU
auto lod0 = lod[0]; auto lod0 = lod[0];
auto lod_len = lod0.size(); auto lod_len = lod0.size();
thrust::device_vector<size_t> dev_in_lod = lod0; const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace());
size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
// Calc output LoD // Calc output LoD
thrust::device_vector<size_t> dev_out_lod(lod_len); thrust::device_vector<size_t> dev_out_lod(lod_len);
......
...@@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
dim3 grid(1, in_rows.size()); dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel< SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>( T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data, in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
in_row_numel); out_data, in_row_numel);
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
......
...@@ -137,8 +137,8 @@ class TargetAssignKernel : public framework::OpKernel<T> { ...@@ -137,8 +137,8 @@ class TargetAssignKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]); PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
} }
size_t* gt_lod_data = gt_lod.data(ctx.GetPlace()); size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace());
size_t* neg_lod_data = neg_lod.data(ctx.GetPlace()); size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data, TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
gt_lod_data, background_label, num, gt_lod_data, background_label, num,
......
...@@ -7,7 +7,3 @@ if(WITH_PYTHON) ...@@ -7,7 +7,3 @@ if(WITH_PYTHON)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID) endif(NOT APPLE AND NOT ANDROID)
endif(WITH_PYTHON) endif(WITH_PYTHON)
if(WITH_DOC)
cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
endif(WITH_DOC)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <sstream> // std::stringstream
#include <string>
#include "paddle/framework/op_info.h"
#include "paddle/framework/op_registry.h"
#include "paddle/pybind/pybind.h"
std::string Escape(const std::string& s) {
std::string r;
for (size_t i = 0; i < s.size(); i++) {
switch (s[i]) {
case '\"':
r += "\\\"";
break;
case '\\':
r += "\\\\";
break;
case '\n':
r += "\\n";
break;
case '\t':
r += "\\t";
case '\r':
break;
default:
r += s[i];
break;
}
}
return r;
}
std::string AttrType(paddle::framework::proto::AttrType at) {
switch (at) {
case paddle::framework::proto::INT:
return "int";
case paddle::framework::proto::FLOAT:
return "float";
case paddle::framework::proto::STRING:
return "string";
case paddle::framework::proto::BOOLEAN:
return "bool";
case paddle::framework::proto::INTS:
return "int array";
case paddle::framework::proto::FLOATS:
return "float array";
case paddle::framework::proto::STRINGS:
return "string array";
case paddle::framework::proto::BOOLEANS:
return "bool array";
case paddle::framework::proto::BLOCK:
return "block id";
case paddle::framework::proto::LONG:
return "long";
}
return "UNKNOWN"; // not possible
}
void PrintVar(const paddle::framework::proto::OpProto::Var& v,
std::stringstream& ss) {
ss << " { "
<< "\n"
<< " \"name\" : \"" << Escape(v.name()) << "\",\n"
<< " \"comment\" : \"" << Escape(v.comment()) << "\",\n"
<< " \"duplicable\" : " << v.duplicable() << ",\n"
<< " \"intermediate\" : " << v.intermediate() << "\n"
<< " },";
}
void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
std::stringstream& ss) {
ss << " { "
<< "\n"
<< " \"name\" : \"" << Escape(a.name()) << "\",\n"
<< " \"type\" : \"" << AttrType(a.type()) << "\",\n"
<< " \"comment\" : \"" << Escape(a.comment()) << "\",\n"
<< " \"generated\" : " << a.generated() << "\n"
<< " },";
}
void PrintOpProto(const std::string& type,
const paddle::framework::OpInfo& opinfo,
std::stringstream& ss) {
std::cerr << "Processing " << type << "\n";
const paddle::framework::proto::OpProto* p = opinfo.proto_;
if (p == nullptr) {
return; // It is possible that an operator doesn't have OpProto.
}
ss << "{\n"
<< " \"type\" : \"" << Escape(p->type()) << "\",\n"
<< " \"comment\" : \"" << Escape(p->comment()) << "\",\n";
ss << " \"inputs\" : [ "
<< "\n";
for (int i = 0; i < p->inputs_size(); i++) {
PrintVar(p->inputs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ], "
<< "\n";
ss << " \"outputs\" : [ "
<< "\n";
for (int i = 0; i < p->outputs_size(); i++) {
PrintVar(p->outputs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ], "
<< "\n";
ss << " \"attrs\" : [ "
<< "\n";
for (int i = 0; i < p->attrs_size(); i++) {
PrintAttr(p->attrs(i), ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << " ] "
<< "\n";
ss << "},";
}
int main() {
std::stringstream ss;
ss << "[\n";
for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) {
PrintOpProto(iter.first, iter.second, ss);
}
ss.seekp(-1, ss.cur); // remove the trailing comma
ss << "]\n";
std::cout << ss.str();
}
...@@ -118,8 +118,6 @@ EOF ...@@ -118,8 +118,6 @@ EOF
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py
make -j `nproc` paddle_python make -j `nproc` paddle_python
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
make -j `nproc` print_operators_doc
paddle/pybind/print_operators_doc > doc/en/html/operators.json
popd popd
fi fi
......
...@@ -10,8 +10,6 @@ cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON ...@@ -10,8 +10,6 @@ cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py
make -j `nproc` paddle_python make -j `nproc` paddle_python
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
make -j `nproc` print_operators_doc
paddle/pybind/print_operators_doc > doc/en/html/operators.json
# check websites for broken links # check websites for broken links
linkchecker doc/en/html/index.html linkchecker doc/en/html/index.html
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
std::vector<char*> new_argv; std::vector<char*> new_argv;
std::string gflags_env; std::string gflags_env;
for (int i = 0; i < argc; ++i) { for (int i = 0; i < argc; ++i) {
...@@ -35,7 +36,6 @@ int main(int argc, char** argv) { ...@@ -35,7 +36,6 @@ int main(int argc, char** argv) {
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
testing::InitGoogleTest(&argc, argv);
paddle::memory::Used(paddle::platform::CPUPlace()); paddle::memory::Used(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -29,7 +29,7 @@ import optimizer ...@@ -29,7 +29,7 @@ import optimizer
import learning_rate_decay import learning_rate_decay
import backward import backward
import regularizer import regularizer
from param_attr import ParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace from core import LoDTensor, CPUPlace, CUDAPlace
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
...@@ -41,11 +41,26 @@ import profiler ...@@ -41,11 +41,26 @@ import profiler
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay', 'io',
'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'initializer',
'ParamAttr' 'layers',
'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler', 'nets',
'memory_optimize', 'profiler' 'optimizer',
'learning_rate_decay',
'backward',
'regularizer',
'LoDTensor',
'CPUPlace',
'CUDAPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
'DataFeeder',
'clip',
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'memory_optimize',
'profiler',
] ]
......
...@@ -300,6 +300,9 @@ class DistributeTranspiler: ...@@ -300,6 +300,9 @@ class DistributeTranspiler:
pass pass
return orig_shape return orig_shape
def _op_input_var(self, op, varname):
pass
def _is_op_on_pserver(self, endpoint, all_ops, idx): def _is_op_on_pserver(self, endpoint, all_ops, idx):
""" """
Recursively check if the op need to run on current server. Recursively check if the op need to run on current server.
...@@ -309,44 +312,51 @@ class DistributeTranspiler: ...@@ -309,44 +312,51 @@ class DistributeTranspiler:
p.name for p in self.param_grad_ep_mapping[endpoint]["params"] p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
] ]
op = all_ops[idx] op = all_ops[idx]
if op.inputs.has_key("Param"): input_names = set(op.input_names)
if op.inputs["Param"].name in param_names: # TODO(typhoonzero): using Param and Grad input name to identify
# that the operator is an optimization operator, need a better way.
if "Param" in input_names:
if op.input("Param")[0] in param_names:
return True return True
else: else:
for n in param_names: for n in param_names:
if same_or_split_var(n, op.inputs[ if same_or_split_var(n, op.input("Param")[0]) \
"Param"].name) and n != op.inputs["Param"].name: and n != op.input("Param")[0]:
return True return True
return False return False
else: else:
j = idx - 1 j = idx - 1
while j >= 0: while j >= 0:
prev_op = all_ops[j] prev_op = all_ops[j]
prev_output_names = [o.name for o in prev_op.outputs.values()] # prev_output_names = [o.name for o in prev_op.outputs.values()]
prev_input_names = [o.name for o in prev_op.inputs.values()] # prev_input_names = [o.name for o in prev_op.inputs.values()]
# NOTE(typhoonzero): consider list input/output
prev_output_names = prev_op.desc.output_arg_names()
prev_input_names = prev_op.desc.input_arg_names()
found1 = False found1 = False
found2 = False found2 = False
for _, v in op.inputs.iteritems(): for varname in op.desc.input_arg_names():
if v.name in prev_output_names: if varname in prev_output_names:
found1 = self._is_op_on_pserver(endpoint, all_ops, j) found1 = self._is_op_on_pserver(endpoint, all_ops, j)
# later ops may produce output for prev op's next batch use. # later ops may produce output for prev op's next batch use.
for _, v in op.outputs.iteritems(): for varname in op.desc.output_arg_names():
if v.name in prev_input_names: if varname in prev_input_names:
found2 = self._is_op_on_pserver(endpoint, all_ops, j) found2 = self._is_op_on_pserver(endpoint, all_ops, j)
if found1 or found2: if found1 or found2:
return True return True
j -= 1 j -= 1
return False return False
def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint): def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
program = optimize_block.program
new_inputs = dict() new_inputs = dict()
# update param/grad shape first, then other inputs like # update param/grad shape first, then other inputs like
# moment can use the updated shape # moment can use the updated shape
for key, var in opt_op.inputs.iteritems(): for key in opt_op.input_names:
if key == "Grad": if key == "Grad":
grad_block = None grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]: for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var(g.name, var.name): if same_or_split_var(g.name, opt_op.input(key)[0]):
grad_block = g grad_block = g
break break
if not grad_block: if not grad_block:
...@@ -362,11 +372,11 @@ class DistributeTranspiler: ...@@ -362,11 +372,11 @@ class DistributeTranspiler:
if self.trainers > 1: if self.trainers > 1:
vars2merge = self._create_var_for_trainers( vars2merge = self._create_var_for_trainers(
program.global_block(), grad_block, self.trainers) program.global_block(), grad_block, self.trainers)
program.global_block().append_op( optimize_block.append_op(
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var})
program.global_block().append_op( optimize_block.append_op(
type="scale", type="scale",
inputs={"X": merged_var}, inputs={"X": merged_var},
outputs={"Out": merged_var}, outputs={"Out": merged_var},
...@@ -376,7 +386,7 @@ class DistributeTranspiler: ...@@ -376,7 +386,7 @@ class DistributeTranspiler:
# param is already created on global program # param is already created on global program
param_block = None param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]: for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, var.name): if same_or_split_var(p.name, opt_op.input(key)[0]):
param_block = p param_block = p
break break
if not param_block: if not param_block:
...@@ -389,11 +399,12 @@ class DistributeTranspiler: ...@@ -389,11 +399,12 @@ class DistributeTranspiler:
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
for key, var in opt_op.inputs.iteritems(): for key in opt_op.input_names:
if key in ["Param", "Grad"]: if key in ["Param", "Grad"]:
continue continue
# update accumulator variable shape # update accumulator variable shape
param_shape = new_inputs["Param"].shape param_shape = new_inputs["Param"].shape
var = program.global_block().vars[opt_op.input(key)[0]]
new_shape = self._get_optimizer_input_shape(opt_op.type, key, new_shape = self._get_optimizer_input_shape(opt_op.type, key,
var.shape, param_shape) var.shape, param_shape)
tmpvar = program.global_block().create_var( tmpvar = program.global_block().create_var(
...@@ -402,40 +413,41 @@ class DistributeTranspiler: ...@@ -402,40 +413,41 @@ class DistributeTranspiler:
dtype=var.dtype, dtype=var.dtype,
shape=new_shape) shape=new_shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
# create var in pserver program global block.
# TODO(typhoonzero): put blocks in one program to avoid create two
# variables.
pserver_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=new_shape)
# change output's ParamOut variable # change output's ParamOut variable
opt_op.outputs["ParamOut"] = new_inputs["Param"] outputs = self._get_output_map_from_op(program.global_block(), opt_op)
program.global_block().append_op( outputs["ParamOut"] = new_inputs["Param"]
optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=opt_op.outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.attrs)
def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program
# Append the ops for parameters that do not need to be optimized/updated # Append the ops for parameters that do not need to be optimized/updated
for _, var in opt_op.inputs.iteritems(): inputs = self._get_input_map_from_op(self.program.global_block().vars,
program.global_block().create_var( opt_op)
name=var.name, for var in inputs.itervalues():
persistable=var.persistable, if type(var) == list:
dtype=var.dtype, varlist = var
shape=var.shape) else:
pserver_program.global_block().create_var( varlist = [var]
name=var.name, for var in varlist:
persistable=var.persistable, if not program.global_block().vars.has_key(var.name):
dtype=var.dtype, program.global_block().create_var(
shape=var.shape) name=var.name,
program.global_block().append_op( persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op)
optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=opt_op.inputs, inputs=inputs,
outputs=opt_op.outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.attrs)
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
...@@ -465,26 +477,25 @@ class DistributeTranspiler: ...@@ -465,26 +477,25 @@ class DistributeTranspiler:
dtype=v.dtype, dtype=v.dtype,
shape=v.shape) shape=v.shape)
# step6 # step6
optimize_sub_program = Program() optimize_block = pserver_program.create_block(0)
# Iterate through the ops and append ops as needed # Iterate through the ops and append ops as needed
for idx, opt_op in enumerate(self.optimize_ops): for idx, opt_op in enumerate(self.optimize_ops):
is_op_on_pserver = self._is_op_on_pserver(endpoint, is_op_on_pserver = self._is_op_on_pserver(endpoint,
self.optimize_ops, idx) self.optimize_ops, idx)
if not is_op_on_pserver: if not is_op_on_pserver:
continue continue
if opt_op.inputs.has_key("Grad"): if "Grad" in opt_op.desc.input_arg_names():
self._append_pserver_ops(optimize_sub_program, pserver_program, self._append_pserver_ops(optimize_block, opt_op, endpoint)
opt_op, endpoint)
else: else:
self._append_pserver_non_opt_ops(optimize_sub_program, self._append_pserver_non_opt_ops(optimize_block, opt_op)
pserver_program, opt_op)
# Append the listen_and_serv op # Append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="listen_and_serv", type="listen_and_serv",
inputs={}, inputs={},
outputs={}, outputs={},
attrs={ attrs={
"OptimizeBlock": optimize_sub_program.global_block(), "OptimizeBlock": optimize_block,
"endpoint": endpoint, "endpoint": endpoint,
"ParamList": [ "ParamList": [
p.name p.name
...@@ -499,6 +510,30 @@ class DistributeTranspiler: ...@@ -499,6 +510,30 @@ class DistributeTranspiler:
pserver_program.sync_with_cpp() pserver_program.sync_with_cpp()
return pserver_program return pserver_program
def _get_input_map_from_op(self, varmap, op):
iomap = dict()
for key in op.input_names:
vars = []
for varname in op.input(key):
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def _get_output_map_from_op(self, varmap, op):
iomap = dict()
for key in op.output_names:
vars = []
for varname in op.output(key):
vars.append(varmap[varname])
if len(vars) == 1:
iomap[key] = vars[0]
else:
iomap[key] = vars
return iomap
def get_startup_program(self, endpoint, pserver_program): def get_startup_program(self, endpoint, pserver_program):
""" """
Get startup program for current parameter server. Get startup program for current parameter server.
...@@ -529,17 +564,21 @@ class DistributeTranspiler: ...@@ -529,17 +564,21 @@ class DistributeTranspiler:
# 2. rename op outputs # 2. rename op outputs
for op in orig_s_prog.global_block().ops: for op in orig_s_prog.global_block().ops:
new_inputs = dict()
new_outputs = dict() new_outputs = dict()
# do not append startup op if var is not on this pserver # do not append startup op if var is not on this pserver
op_on_pserver = False op_on_pserver = False
for key, var in op.outputs.iteritems(): for key in op.output_names:
newname, _ = _get_splited_name_and_shape(var.name) newname, _ = _get_splited_name_and_shape(op.output(key)[0])
if newname: if newname:
op_on_pserver = True op_on_pserver = True
new_outputs[key] = created_var_map[newname] new_outputs[key] = created_var_map[newname]
elif var.name in pserver_vars: elif op.output(key)[0] in pserver_vars:
op_on_pserver = True op_on_pserver = True
new_outputs[key] = pserver_vars[var.name] new_outputs[key] = pserver_vars[op.output(key)[0]]
# most startup program ops have no inputs
new_inputs = self._get_input_map_from_op(pserver_vars, op)
if op_on_pserver: if op_on_pserver:
if op.type in [ if op.type in [
...@@ -548,7 +587,7 @@ class DistributeTranspiler: ...@@ -548,7 +587,7 @@ class DistributeTranspiler:
op.attrs["shape"] = new_outputs["Out"].shape op.attrs["shape"] = new_outputs["Out"].shape
s_prog.global_block().append_op( s_prog.global_block().append_op(
type=op.type, type=op.type,
inputs=op.inputs, inputs=new_inputs,
outputs=new_outputs, outputs=new_outputs,
attrs=op.attrs) attrs=op.attrs)
return s_prog return s_prog
...@@ -47,27 +47,13 @@ def as_numpy(tensor): ...@@ -47,27 +47,13 @@ def as_numpy(tensor):
return [as_numpy(t) for t in tensor] return [as_numpy(t) for t in tensor]
assert isinstance(tensor, core.LoDTensor) assert isinstance(tensor, core.LoDTensor)
lod = tensor.lod() lod = tensor.lod()
tensor_data = np.array(tensor) if len(lod) > 0:
if len(lod) == 0: raise RuntimeError(
ans = tensor_data "Some of your featched tensors hold LoD information. \
else: They can not be completely cast to Python ndarray. \
raise RuntimeError("LoD Calculate lacks unit tests and buggy") Please set the parameter 'return_numpy' as 'False' to \
# elif len(lod) == 1: return LoDTensor itself directly.")
# ans = [] return np.array(tensor)
# idx = 0
# while idx < len(lod) - 1:
# ans.append(tensor_data[lod[idx]:lod[idx + 1]])
# idx += 1
# else:
# for l in reversed(lod):
# ans = []
# idx = 0
# while idx < len(l) - 1:
# ans.append(tensor_data[l[idx]:l[idx + 1]])
# idx += 1
# tensor_data = ans
# ans = tensor_data
return ans
def has_feed_operators(block, feed_targets, feed_holder_name): def has_feed_operators(block, feed_targets, feed_holder_name):
...@@ -306,7 +292,6 @@ class Executor(object): ...@@ -306,7 +292,6 @@ class Executor(object):
core.get_fetch_variable(scope, fetch_var_name, i) core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list)) for i in xrange(len(fetch_list))
] ]
if return_numpy: if return_numpy:
outs = as_numpy(outs) outs = as_numpy(outs)
return outs return outs
...@@ -740,6 +740,9 @@ class Block(object): ...@@ -740,6 +740,9 @@ class Block(object):
raise e raise e
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
def slice_ops(self, start, end):
return list(self.ops)[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
......
...@@ -14,14 +14,37 @@ ...@@ -14,14 +14,37 @@
import framework import framework
import numpy as np import numpy as np
import contextlib
__all__ = [ __all__ = [
'Constant', 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
'Uniform', 'init_on_cpu'
'Normal',
'Xavier',
] ]
_force_init_on_cpu_ = False
def force_init_on_cpu():
return _force_init_on_cpu_
@contextlib.contextmanager
def init_on_cpu():
"""
Switch program with `with` statement
Examples:
>>> with init_on_cpu():
>>> step = layers.create_global_var()
"""
global _force_init_on_cpu_
pre_state = force_init_on_cpu()
_force_init_on_cpu_ = True
yield
_force_init_on_cpu_ = pre_state
class Initializer(object): class Initializer(object):
"""Base class for variable initializers """Base class for variable initializers
...@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer): ...@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer):
"""Implements the constant initializer """Implements the constant initializer
""" """
def __init__(self, value=0.0): def __init__(self, value=0.0, force_cpu=False):
"""Constructor for ConstantInitializer """Constructor for ConstantInitializer
Args: Args:
...@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer): ...@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer):
assert value is not None assert value is not None
super(ConstantInitializer, self).__init__() super(ConstantInitializer, self).__init__()
self._value = value self._value = value
self._force_cpu = force_cpu
def __call__(self, var, block): def __call__(self, var, block):
"""Add constant initialization ops for a variable """Add constant initialization ops for a variable
...@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer): ...@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer):
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": int(var.dtype),
"value": self._value "value": float(self._value),
'force_cpu': self._force_cpu or force_init_on_cpu()
}) })
var.op = op var.op = op
return op return op
......
...@@ -342,7 +342,11 @@ def save_inference_model(dirname, ...@@ -342,7 +342,11 @@ def save_inference_model(dirname,
prepend_feed_ops(inference_program, feeded_var_names) prepend_feed_ops(inference_program, feeded_var_names)
append_fetch_ops(inference_program, fetch_var_names) append_fetch_ops(inference_program, fetch_var_names)
model_file_name = dirname + "/__model__" if save_file_name == None:
model_file_name = dirname + "/__model__"
else:
model_file_name = dirname + "/__model_combined__"
with open(model_file_name, "wb") as f: with open(model_file_name, "wb") as f:
f.write(inference_program.desc.serialize_to_string()) f.write(inference_program.desc.serialize_to_string())
...@@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None): ...@@ -384,7 +388,11 @@ def load_inference_model(dirname, executor, load_file_name=None):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
model_file_name = dirname + "/__model__" if load_file_name == None:
model_file_name = dirname + "/__model__"
else:
model_file_name = dirname + "/__model_combined__"
with open(model_file_name, "rb") as f: with open(model_file_name, "rb") as f:
program_desc_str = f.read() program_desc_str = f.read()
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from ..framework import Variable, unique_name from ..framework import Variable, unique_name
from layer_function_generator import OpProtoHolder from layer_function_generator import OpProtoHolder
from ..initializer import force_init_on_cpu
__all__ = ['monkey_patch_variable'] __all__ = ['monkey_patch_variable']
...@@ -36,9 +37,12 @@ def monkey_patch_variable(): ...@@ -36,9 +37,12 @@ def monkey_patch_variable():
block.append_op( block.append_op(
type="fill_constant", type="fill_constant",
outputs={'Out': [var]}, outputs={'Out': [var]},
attrs={'dtype': var.dtype, attrs={
'shape': shape, 'dtype': var.dtype,
'value': value}) 'shape': shape,
'value': value,
'force_cpu': force_init_on_cpu()
})
return var return var
def create_scalar(block, value, dtype): def create_scalar(block, value, dtype):
......
...@@ -65,6 +65,7 @@ __all__ = [ ...@@ -65,6 +65,7 @@ __all__ = [
'beam_search', 'beam_search',
'row_conv', 'row_conv',
'multiplex', 'multiplex',
'layer_norm',
] ]
...@@ -184,7 +185,7 @@ def fc(input, ...@@ -184,7 +185,7 @@ def fc(input,
helper.append_op( helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
# add bias # add bias
pre_activation = helper.append_bias_op(pre_bias) pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
# add activation # add activation
return helper.append_activation(pre_activation) return helper.append_activation(pre_activation)
...@@ -641,8 +642,8 @@ def dynamic_gru(input, ...@@ -641,8 +642,8 @@ def dynamic_gru(input,
Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
Returns: Returns:
Variable: The hidden state of GRU. The shape is (T \\times D), and lod \ Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
is the same with the input. and lod is the same with the input.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -990,7 +991,7 @@ def square_error_cost(input, label, **kwargs): ...@@ -990,7 +991,7 @@ def square_error_cost(input, label, **kwargs):
label(Variable): Label tensor, has target labels. label(Variable): Label tensor, has target labels.
Returns: Returns:
Variable: The tensor variable storing the element-wise squared error Variable: The tensor variable storing the element-wise squared error \
difference of input and label. difference of input and label.
Examples: Examples:
...@@ -1214,7 +1215,7 @@ def conv2d(input, ...@@ -1214,7 +1215,7 @@ def conv2d(input,
act(str): Activation type. Default: None act(str): Activation type. Default: None
Returns: Returns:
Variable: The tensor variable storing the convolution and Variable: The tensor variable storing the convolution and \
non-linearity activation result. non-linearity activation result.
Raises: Raises:
...@@ -1565,6 +1566,102 @@ def batch_norm(input, ...@@ -1565,6 +1566,102 @@ def batch_norm(input,
return helper.append_activation(batch_norm_out) return helper.append_activation(batch_norm_out)
def layer_norm(input,
scale=True,
shift=True,
begin_norm_axis=1,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None,
name=None):
"""
**Layer Normalization**
Assume feature vectors exist on dimensions
:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
along these dimensions for each feature vector :math:`a` with size
:math:`H`, then normalize each feature vector using the corresponding
statistics. After that, apply learnable gain and bias on the normalized
tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
The formula is as follows:
.. math::
\\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
\\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
Args:
input(Variable): The input tensor variable.
scale(bool): Whether to learn the adaptive gain :math:`g` after
normalization.
shift(bool): Whether to learn the adaptive bias :math:`b` after
normalization.
begin_norm_axis(bool): The normalization will be performed along
dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
epsilon(float): The small value added to the variance to prevent
division by zero.
param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`.
act(str): Activation to be applied to the output of layer normalizaiton.
Returns:
Variable: A tensor variable with the same shape as the input.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
"""
helper = LayerHelper('layer_norm', **locals())
dtype = helper.input_dtype()
# create intput and parameters
inputs = {'X': input}
input_shape = input.shape
param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
if scale:
scale = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(1.0))
inputs['Scale'] = scale
if shift:
assert bias_attr is not False
bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
inputs['Bias'] = bias
# create output
mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
layer_norm_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis})
return helper.append_activation(layer_norm_out)
def beam_search_decode(ids, scores, name=None): def beam_search_decode(ids, scores, name=None):
helper = LayerHelper('beam_search_decode', **locals()) helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
......
...@@ -65,6 +65,8 @@ __all__ = [ ...@@ -65,6 +65,8 @@ __all__ = [
'logical_or', 'logical_or',
'logical_xor', 'logical_xor',
'logical_not', 'logical_not',
'uniform_random',
'cumsum',
] + __activations__ ] + __activations__
for _OP in set(__all__): for _OP in set(__all__):
......
...@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper ...@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable from ..framework import Variable
from ..initializer import Constant from ..initializer import Constant, force_init_on_cpu
from ..core import DataType from ..core import DataType
import numpy import numpy
...@@ -69,12 +69,30 @@ def create_parameter(shape, ...@@ -69,12 +69,30 @@ def create_parameter(shape,
default_initializer) default_initializer)
def create_global_var(shape, value, dtype, persistable=False, name=None): def create_global_var(shape,
value,
dtype,
persistable=False,
force_cpu=False,
name=None):
"""
Create a global variable. such as global_step
Args:
shape(list[int]): shape of the variable
value(float): the value of the variable
dtype(string): element type of the parameter
persistable(bool): if this variable is persistable
force_cpu(bool): force this variable to be on CPU
Returns:
Variable: the created Variable
"""
helper = LayerHelper("global_var", **locals()) helper = LayerHelper("global_var", **locals())
var = helper.create_global_variable( var = helper.create_global_variable(
dtype=dtype, shape=shape, persistable=persistable, name=name) dtype=dtype, shape=shape, persistable=persistable, name=name)
helper.set_variable_initializer( helper.set_variable_initializer(
var, initializer=Constant(value=float(value))) var, initializer=Constant(
value=float(value), force_cpu=force_cpu))
return var return var
...@@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): ...@@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
dtype(np.dtype|core.DataType|str): Data type of the output tensor. dtype(np.dtype|core.DataType|str): Data type of the output tensor.
value(float): The constant value used to initialize the output tensor. value(float): The constant value used to initialize the output tensor.
out(Variable): The output tensor. out(Variable): The output tensor.
force_cpu(True|False): data should be on CPU if set true.
Returns: Returns:
Variable: The tensor variable storing the output. Variable: The tensor variable storing the output.
...@@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): ...@@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
'shape': shape, 'shape': shape,
'dtype': out.dtype, 'dtype': out.dtype,
'value': float(value), 'value': float(value),
'force_cpu': force_cpu 'force_cpu': force_cpu or force_init_on_cpu()
}) })
out.stop_gradient = True out.stop_gradient = True
return out return out
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import layers import layers
from framework import Variable from framework import Variable
from initializer import init_on_cpu
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
...@@ -54,11 +55,14 @@ def exponential_decay(learning_rate, ...@@ -54,11 +55,14 @@ def exponential_decay(learning_rate,
if not isinstance(global_step, Variable): if not isinstance(global_step, Variable):
raise ValueError("global_step is required for exponential_decay.") raise ValueError("global_step is required for exponential_decay.")
# update learning_rate with init_on_cpu():
div_res = global_step / decay_steps # update learning_rate
if staircase: div_res = global_step / decay_steps
div_res = layers.floor(x=div_res) if staircase:
return learning_rate * (decay_rate**div_res) div_res = layers.floor(x=div_res)
decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr
def natural_exp_decay(learning_rate, def natural_exp_decay(learning_rate,
...@@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate, ...@@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate,
if not isinstance(global_step, Variable): if not isinstance(global_step, Variable):
raise ValueError("global_step is required for natural_exp_decay.") raise ValueError("global_step is required for natural_exp_decay.")
div_res = global_step / decay_steps with init_on_cpu():
if staircase: div_res = global_step / decay_steps
div_res = layers.floor(x=div_res) if staircase:
return learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) div_res = layers.floor(x=div_res)
decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
return decayed_lr
def inverse_time_decay(learning_rate, def inverse_time_decay(learning_rate,
...@@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate, ...@@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate,
if not isinstance(global_step, Variable): if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.") raise ValueError("global_step is required for inverse_time_decay.")
div_res = global_step / decay_steps with init_on_cpu():
if staircase: div_res = global_step / decay_steps
div_res = layers.floor(x=div_res) if staircase:
div_res = layers.floor(x=div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res)
return learning_rate / (1 + decay_rate * div_res) return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
...@@ -160,22 +170,27 @@ def polynomial_decay(learning_rate, ...@@ -160,22 +170,27 @@ def polynomial_decay(learning_rate,
if not isinstance(global_step, Variable): if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.") raise ValueError("global_step is required for inverse_time_decay.")
if cycle: with init_on_cpu():
div_res = layers.ceil(x=(global_step / decay_steps)) if cycle:
zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0) div_res = layers.ceil(x=(global_step / decay_steps))
one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0) zero_var = layers.fill_constant(
shape=[1], dtype='float32', value=0.0)
with layers.Switch() as switch: one_var = layers.fill_constant(
with switch.case(layers.equal(x=global_step, y=zero_var)): shape=[1], dtype='float32', value=1.0)
layers.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res with layers.Switch() as switch:
else: with switch.case(layers.equal(x=global_step, y=zero_var)):
decay_steps_var = layers.fill_constant( layers.assign(input=one_var, output=div_res)
shape=[1], dtype='float32', value=float(decay_steps)) decay_steps = decay_steps * div_res
global_step = layers.elementwise_min(x=global_step, y=decay_steps_var) else:
decay_steps_var = layers.fill_constant(
return (learning_rate - end_learning_rate) * \ shape=[1], dtype='float32', value=float(decay_steps))
((1 - global_step / decay_steps) ** power) + end_learning_rate global_step = layers.elementwise_min(
x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr
def piecewise_decay(global_step, boundaries, values): def piecewise_decay(global_step, boundaries, values):
...@@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values): ...@@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values):
if not isinstance(global_step, Variable): if not isinstance(global_step, Variable):
raise ValueError("global_step is required for piecewise_decay.") raise ValueError("global_step is required for piecewise_decay.")
lr = layers.create_global_var( with init_on_cpu():
shape=[1], lr = layers.create_global_var(
value=0.0, shape=[1],
dtype='float32', value=0.0,
persistable=True, dtype='float32',
name="learning_rate") persistable=True,
name="learning_rate")
with layers.Switch() as switch:
for i in range(len(boundaries)): with layers.Switch() as switch:
boundary_val = layers.fill_constant( for i in range(len(boundaries)):
shape=[1], dtype='float32', value=float(boundaries[i])) boundary_val = layers.fill_constant(
value_var = layers.fill_constant( shape=[1], dtype='float32', value=float(boundaries[i]))
shape=[1], dtype='float32', value=float(values[i])) value_var = layers.fill_constant(
with switch.case(layers.less_than(global_step, boundary_val)): shape=[1], dtype='float32', value=float(values[i]))
layers.assign(value_var, lr) with switch.case(layers.less_than(global_step, boundary_val)):
last_value_var = layers.fill_constant( layers.assign(value_var, lr)
shape=[1], dtype='float32', value=float(values[len(values) - 1])) last_value_var = layers.fill_constant(
with switch.default(): shape=[1],
layers.assign(last_value_var, lr) dtype='float32',
value=float(values[len(values) - 1]))
with switch.default():
layers.assign(last_value_var, lr)
return lr return lr
...@@ -92,14 +92,13 @@ class ControlFlowGraph(object): ...@@ -92,14 +92,13 @@ class ControlFlowGraph(object):
live_in = defaultdict(set) live_in = defaultdict(set)
live_out = defaultdict(set) live_out = defaultdict(set)
while True: while True:
for i in range(self.op_size): for i in range(self.op_size, 0, -1):
live_in[i] = set(self._live_in[i]) live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i]) live_out[i] = set(self._live_out[i])
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
for s in self._successors[i]: for s in self._successors[i]:
self._live_out[i] |= self._live_in[s] self._live_out[i] |= self._live_in[s]
self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i])
if self._reach_fixed_point(live_in, live_out): if self._reach_fixed_point(live_in, live_out):
break break
......
...@@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries, ...@@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries,
Returns: Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product Variable: A 3-D Tensor computed by multi-head scaled dot product \
attention. attention.
Raises: Raises:
...@@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries, ...@@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries,
x=product, shape=[-1, product.shape[-1]], act="softmax"), x=product, shape=[-1, product.shape[-1]], act="softmax"),
shape=product.shape) shape=product.shape)
if dropout_rate: if dropout_rate:
weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False) weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
ctx_multiheads = layers.matmul(weights, v) ctx_multiheads = layers.matmul(weights, v)
return __combine_heads(ctx_multiheads) return __combine_heads(ctx_multiheads)
...@@ -190,6 +190,8 @@ class Optimizer(object): ...@@ -190,6 +190,8 @@ class Optimizer(object):
# Create any accumulators # Create any accumulators
program = loss.block.program program = loss.block.program
with program_guard(program, startup_program): with program_guard(program, startup_program):
global_block = framework.default_main_program().global_block()
start = len(global_block.ops)
self.helper = LayerHelper(self.__class__.__name__) self.helper = LayerHelper(self.__class__.__name__)
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
...@@ -203,19 +205,14 @@ class Optimizer(object): ...@@ -203,19 +205,14 @@ class Optimizer(object):
param_and_grad) param_and_grad)
optimize_ops.append(optimize_op) optimize_ops.append(optimize_op)
# Returned list of ops can include more ops in addition
# to optimization ops
return_ops = optimize_ops
# Get custom finish ops for subclasses # Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies # FIXME: Need to fix this once we figure out how to handle dependencies
finish_ops = self._finish_update(loss.block) self._finish_update(loss.block)
if finish_ops is not None:
return_ops += finish_ops
if self._global_step is not None: if self._global_step is not None:
return_ops.append(self._increment_global_step(loss.block)) self._increment_global_step(loss.block)
return return_ops end = len(global_block.ops)
return global_block.slice_ops(start, end)
def minimize(self, def minimize(self,
loss, loss,
......
recognize_digits_*.inference.model *.inference.model
...@@ -15,15 +15,13 @@ ...@@ -15,15 +15,13 @@
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib import contextlib
import numpy
import unittest import unittest
import math import math
import sys import sys
def main(use_cuda): def train(use_cuda, save_dirname):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
x = fluid.layers.data(name='x', shape=[13], dtype='float32') x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None) y_predict = fluid.layers.fc(input=x, size=1, act=None)
...@@ -51,14 +49,15 @@ def main(use_cuda): ...@@ -51,14 +49,15 @@ def main(use_cuda):
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(), avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print(avg_loss_value) print(avg_loss_value)
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, ['x'],
[y_predict], exe)
return return
if math.isnan(float(avg_loss_value)): if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
...@@ -66,6 +65,43 @@ def main(use_cuda): ...@@ -66,6 +65,43 @@ def main(use_cuda):
avg_loss_value[0])) avg_loss_value[0]))
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input's dimension should be 2-D and the second dim is 13
# The input data should be >= 0
batch_size = 10
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
assert feed_target_names[0] == 'x'
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_x},
fetch_list=fetch_targets)
print("infer shape: ", results[0].shape)
print("infer results: ", results[0])
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "fit_a_line.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestFitALine(unittest.TestCase): class TestFitALine(unittest.TestCase):
def test_cpu(self): def test_cpu(self):
with self.program_scope_guard(): with self.program_scope_guard():
......
...@@ -18,6 +18,7 @@ import numpy as np ...@@ -18,6 +18,7 @@ import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05 import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
from paddle.v2.fluid.initializer import init_on_cpu
import contextlib import contextlib
import time import time
import unittest import unittest
...@@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None): ...@@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None):
# TODO(qiao) # TODO(qiao)
# check other optimizers and check why out will be NAN # check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) global_step = fluid.layers.create_global_var(
shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.learning_rate_decay.exponential_decay(
learning_rate=0.0001,
global_step=global_step,
decay_steps=100000,
decay_rate=0.5,
staircase=True),
global_step=global_step)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
# TODO(qiao) # TODO(qiao)
......
...@@ -78,7 +78,7 @@ def conv_net(img, label): ...@@ -78,7 +78,7 @@ def conv_net(img, label):
return loss_net(conv_pool_2, label) return loss_net(conv_pool_2, label)
def train(nn_type, use_cuda, parallel, save_dirname): def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
...@@ -143,8 +143,10 @@ def train(nn_type, use_cuda, parallel, save_dirname): ...@@ -143,8 +143,10 @@ def train(nn_type, use_cuda, parallel, save_dirname):
avg_loss_val = numpy.array(avg_loss_set).mean() avg_loss_val = numpy.array(avg_loss_set).mean()
if float(acc_val) > 0.85: # test acc > 85% if float(acc_val) > 0.85: # test acc > 85%
if save_dirname is not None: if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, ["img"], fluid.io.save_inference_model(
[prediction], exe) save_dirname, ["img"], [prediction],
exe,
save_file_name=save_param_filename)
return return
else: else:
print( print(
...@@ -156,7 +158,7 @@ def train(nn_type, use_cuda, parallel, save_dirname): ...@@ -156,7 +158,7 @@ def train(nn_type, use_cuda, parallel, save_dirname):
raise AssertionError("Loss of recognize digits is too large") raise AssertionError("Loss of recognize digits is too large")
def infer(use_cuda, save_dirname=None): def infer(use_cuda, save_dirname=None, param_filename=None):
if save_dirname is None: if save_dirname is None:
return return
...@@ -167,13 +169,14 @@ def infer(use_cuda, save_dirname=None): ...@@ -167,13 +169,14 @@ def infer(use_cuda, save_dirname=None):
# the feed_target_names (the names of variables that will be feeded # the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that # data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators). # we want to obtain data from using fetch operators).
[inference_program, feed_target_names, [inference_program, feed_target_names, fetch_targets
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) ] = fluid.io.load_inference_model(save_dirname, exe, param_filename)
# The input's dimension of conv should be 4-D or 5-D. # The input's dimension of conv should be 4-D or 5-D.
# Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0, tensor_img = numpy.random.uniform(-1.0, 1.0,
[1, 1, 28, 28]).astype("float32") [batch_size, 1, 28, 28]).astype("float32")
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
...@@ -183,36 +186,45 @@ def infer(use_cuda, save_dirname=None): ...@@ -183,36 +186,45 @@ def infer(use_cuda, save_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
def main(use_cuda, parallel, nn_type): def main(use_cuda, parallel, nn_type, combine):
if not use_cuda and not parallel: if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model" save_dirname = "recognize_digits_" + nn_type + ".inference.model"
save_filename = None
if combine == True:
save_filename = "__params_combined__"
else: else:
save_dirname = None save_dirname = None
save_filename = None
train( train(
nn_type=nn_type, nn_type=nn_type,
use_cuda=use_cuda, use_cuda=use_cuda,
parallel=parallel, parallel=parallel,
save_dirname=save_dirname) save_dirname=save_dirname,
infer(use_cuda=use_cuda, save_dirname=save_dirname) save_param_filename=save_filename)
infer(
use_cuda=use_cuda,
save_dirname=save_dirname,
param_filename=save_filename)
class TestRecognizeDigits(unittest.TestCase): class TestRecognizeDigits(unittest.TestCase):
pass pass
def inject_test_method(use_cuda, parallel, nn_type): def inject_test_method(use_cuda, parallel, nn_type, combine):
def __impl__(self): def __impl__(self):
prog = fluid.Program() prog = fluid.Program()
startup_prog = fluid.Program() startup_prog = fluid.Program()
scope = fluid.core.Scope() scope = fluid.core.Scope()
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog): with fluid.program_guard(prog, startup_prog):
main(use_cuda, parallel, nn_type) main(use_cuda, parallel, nn_type, combine)
fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda' fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
if use_cuda else 'cpu', 'parallel' if use_cuda else 'cpu', 'parallel'
if parallel else 'normal') if parallel else 'normal', 'combine'
if combine else 'separate')
setattr(TestRecognizeDigits, fn, __impl__) setattr(TestRecognizeDigits, fn, __impl__)
...@@ -221,7 +233,10 @@ def inject_all_tests(): ...@@ -221,7 +233,10 @@ def inject_all_tests():
for use_cuda in (False, True): for use_cuda in (False, True):
for parallel in (False, True): for parallel in (False, True):
for nn_type in ('mlp', 'conv'): for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type) inject_test_method(use_cuda, parallel, nn_type, True)
# One unit-test for saving parameters as separate files
inject_test_method(False, False, 'mlp', False)
inject_all_tests() inject_all_tests()
......
...@@ -16,7 +16,7 @@ import math ...@@ -16,7 +16,7 @@ import math
import sys import sys
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid.core as core import paddle.v2.fluid as fluid
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets import paddle.v2.fluid.nets as nets
...@@ -104,7 +104,8 @@ def get_mov_combined_features(): ...@@ -104,7 +104,8 @@ def get_mov_combined_features():
CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
category_id = layers.data(name='category_id', shape=[1], dtype='int64') category_id = layers.data(
name='category_id', shape=[1], dtype='int64', lod_level=1)
mov_categories_emb = layers.embedding( mov_categories_emb = layers.embedding(
input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
...@@ -114,7 +115,8 @@ def get_mov_combined_features(): ...@@ -114,7 +115,8 @@ def get_mov_combined_features():
MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') mov_title_id = layers.data(
name='movie_title', shape=[1], dtype='int64', lod_level=1)
mov_title_emb = layers.embedding( mov_title_emb = layers.embedding(
input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
...@@ -144,23 +146,22 @@ def model(): ...@@ -144,23 +146,22 @@ def model():
scale_infer = layers.scale(x=inference, scale=5.0) scale_infer = layers.scale(x=inference, scale=5.0)
label = layers.data(name='score', shape=[1], dtype='float32') label = layers.data(name='score', shape=[1], dtype='float32')
square_cost = layers.square_error_cost(input=scale_infer, label=label) square_cost = layers.square_error_cost(input=scale_infer, label=label)
avg_cost = layers.mean(x=square_cost) avg_cost = layers.mean(x=square_cost)
return avg_cost return scale_infer, avg_cost
def train(use_cuda, save_dirname):
scale_infer, avg_cost = model()
# test program
test_program = fluid.default_main_program().clone()
def main():
cost = model()
sgd_optimizer = SGDOptimizer(learning_rate=0.2) sgd_optimizer = SGDOptimizer(learning_rate=0.2)
opts = sgd_optimizer.minimize(cost) opts = sgd_optimizer.minimize(avg_cost)
if USE_GPU: place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -169,6 +170,8 @@ def main(): ...@@ -169,6 +170,8 @@ def main():
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.movielens.train(), buf_size=8192), paddle.dataset.movielens.train(), buf_size=8192),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
feeding = { feeding = {
'user_id': 0, 'user_id': 0,
...@@ -184,7 +187,7 @@ def main(): ...@@ -184,7 +187,7 @@ def main():
def func_feed(feeding, data): def func_feed(feeding, data):
feed_tensors = {} feed_tensors = {}
for (key, idx) in feeding.iteritems(): for (key, idx) in feeding.iteritems():
tensor = core.LoDTensor() tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title": if key != "category_id" and key != "movie_title":
if key == "score": if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype( numpy_data = np.array(map(lambda x: x[idx], data)).astype(
...@@ -211,16 +214,117 @@ def main(): ...@@ -211,16 +214,117 @@ def main():
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
for data in train_reader(): for batch_id, data in enumerate(train_reader()):
outs = exe.run(framework.default_main_program(), # train a mini-batch
outs = exe.run(program=fluid.default_main_program(),
feed=func_feed(feeding, data), feed=func_feed(feeding, data),
fetch_list=[cost]) fetch_list=[avg_cost])
out = np.array(outs[0]) out = np.array(outs[0])
if out[0] < 6.0: if (batch_id + 1) % 10 == 0:
# if avg cost less than 6.0, we think our code is good. avg_cost_set = []
exit(0) for test_data in test_reader():
avg_cost_np = exe.run(program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI
# get test avg_cost
test_avg_cost = np.array(avg_cost_set).mean()
if test_avg_cost < 6.0:
# if avg_cost less than 6.0, we think our code is good.
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, [
"user_id", "gender_id", "age_id", "job_id",
"movie_id", "category_id", "movie_title"
], [scale_infer], exe)
return
if math.isnan(float(out[0])): if math.isnan(float(out[0])):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
main() def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
results = exe.run(inference_program,
feed={
feed_target_names[0]: user_id,
feed_target_names[1]: gender_id,
feed_target_names[2]: age_id,
feed_target_names[3]: job_id,
feed_target_names[4]: movie_id,
feed_target_names[5]: category_id,
feed_target_names[6]: movie_title
},
fetch_list=fetch_targets,
return_numpy=False)
print("inferred score: ", np.array(results[0]))
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the inference model
save_dirname = "recommender_system.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
if __name__ == '__main__':
main(USE_GPU)
...@@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid ...@@ -18,6 +18,10 @@ import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import contextlib
import math
import sys
import unittest
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
dict_size = 30000 dict_size = 30000
...@@ -145,7 +149,7 @@ def seq_to_seq_net(): ...@@ -145,7 +149,7 @@ def seq_to_seq_net():
cost = fluid.layers.cross_entropy(input=prediction, label=label) cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
return avg_cost return avg_cost, prediction
def to_lodtensor(data, place): def to_lodtensor(data, place):
...@@ -163,8 +167,16 @@ def to_lodtensor(data, place): ...@@ -163,8 +167,16 @@ def to_lodtensor(data, place):
return res return res
def main(): def create_random_lodtensor(lod, place, low, high):
avg_cost = seq_to_seq_net() data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(use_cuda, save_dirname=None):
[avg_cost, prediction] = seq_to_seq_net()
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
...@@ -174,7 +186,7 @@ def main(): ...@@ -174,7 +186,7 @@ def main():
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
place = core.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -185,6 +197,7 @@ def main(): ...@@ -185,6 +197,7 @@ def main():
word_data = to_lodtensor(map(lambda x: x[0], data), place) word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(framework.default_main_program(), outs = exe.run(framework.default_main_program(),
feed={ feed={
'source_sequence': word_data, 'source_sequence': word_data,
...@@ -192,13 +205,86 @@ def main(): ...@@ -192,13 +205,86 @@ def main():
'label_sequence': trg_word_next 'label_sequence': trg_word_next
}, },
fetch_list=[avg_cost]) fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0]) avg_cost_val = np.array(outs[0])
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
" avg_cost=" + str(avg_cost_val)) " avg_cost=" + str(avg_cost_val))
if math.isnan(float(avg_cost_val[0])):
sys.exit("got NaN loss, training failed.")
if batch_id > 3: if batch_id > 3:
exit(0) if save_dirname is not None:
fluid.io.save_inference_model(
save_dirname, ['source_sequence',
'target_sequence'], [prediction], exe)
return
batch_id += 1 batch_id += 1
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10]
word_data = create_random_lodtensor(lod, place, low=0, high=1)
trg_word = create_random_lodtensor(lod, place, low=0, high=1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
assert feed_target_names[0] == 'source_sequence'
assert feed_target_names[1] == 'target_sequence'
results = exe.run(inference_program,
feed={
feed_target_names[0]: word_data,
feed_target_names[1]: trg_word,
},
fetch_list=fetch_targets,
return_numpy=False)
print(results[0].lod())
np_data = np.array(results[0])
print("Inference shape: ", np_data.shape)
print("Inference results: ", np_data)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "rnn_encoder_decoder.inference.model"
train(use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestRnnEncoderDecoder(unittest.TestCase):
def test_cuda(self):
with self.scope_prog_guard():
main(use_cuda=True)
def test_cpu(self):
with self.scope_prog_guard():
main(use_cuda=False)
@contextlib.contextmanager
def scope_prog_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__': if __name__ == '__main__':
main() unittest.main()
...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid ...@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
import contextlib import contextlib
import math import math
import numpy as np
import sys import sys
...@@ -43,7 +44,7 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, ...@@ -43,7 +44,7 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost) adam_optimizer.minimize(avg_cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label) accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy return avg_cost, accuracy, prediction
def stacked_lstm_net(data, def stacked_lstm_net(data,
...@@ -81,13 +82,18 @@ def stacked_lstm_net(data, ...@@ -81,13 +82,18 @@ def stacked_lstm_net(data,
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost) adam_optimizer.minimize(avg_cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label) accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy return avg_cost, accuracy, prediction
def main(word_dict, net_method, use_cuda): def create_random_lodtensor(lod, place, low, high):
if use_cuda and not fluid.core.is_compiled_with_cuda(): data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
return res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
def train(word_dict, net_method, use_cuda, save_dirname=None):
BATCH_SIZE = 128 BATCH_SIZE = 128
PASS_NUM = 5 PASS_NUM = 5
dict_dim = len(word_dict) dict_dim = len(word_dict)
...@@ -96,7 +102,7 @@ def main(word_dict, net_method, use_cuda): ...@@ -96,7 +102,7 @@ def main(word_dict, net_method, use_cuda):
data = fluid.layers.data( data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1) name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64") label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, acc_out = net_method( cost, acc_out, prediction = net_method(
data, label, input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
...@@ -116,6 +122,9 @@ def main(word_dict, net_method, use_cuda): ...@@ -116,6 +122,9 @@ def main(word_dict, net_method, use_cuda):
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
print("cost=" + str(cost_val) + " acc=" + str(acc_val)) print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if cost_val < 0.4 and acc_val > 0.8: if cost_val < 0.4 and acc_val > 0.8:
if save_dirname is not None:
fluid.io.save_inference_model(save_dirname, ["words"],
prediction, exe)
return return
if math.isnan(float(cost_val)): if math.isnan(float(cost_val)):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
...@@ -123,6 +132,49 @@ def main(word_dict, net_method, use_cuda): ...@@ -123,6 +132,49 @@ def main(word_dict, net_method, use_cuda):
net_method.__name__)) net_method.__name__))
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10]
word_dict = paddle.dataset.imdb.word_dict()
tensor_words = create_random_lodtensor(
lod, place, low=0, high=len(word_dict) - 1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
assert feed_target_names[0] == "words"
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_words},
fetch_list=fetch_targets,
return_numpy=False)
print(results[0].lod())
np_data = np.array(results[0])
print("Inference Shape: ", np_data.shape)
print("Inference results: ", np_data)
def main(word_dict, net_method, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
save_dirname = "understand_sentiment.inference.model"
train(word_dict, net_method, use_cuda, save_dirname)
infer(use_cuda, save_dirname)
class TestUnderstandSentiment(unittest.TestCase): class TestUnderstandSentiment(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
......
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册