Commit 62dc593e authored by wanghaox

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into hard_example

......@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
# No need to link libwarpctc.so
if ("${cc_library_DEPS};" MATCHES "warpctc;")
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
......@@ -465,10 +470,10 @@ function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
......
### Design Doc: Switch
### Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
The following example shows the usage of `fluid.switch`.
```python
a = fluid.Var(10)
b = fluid.Var(0)
switch = fluid.switch()
with switch.block():
with switch.case(fluid.less_equal(a, 10)):
fluid.print("Case 1")
with switch.case(fluid.larger(a, 0)):
fluid.print("Case 2")
with switch.default():
fluid.print("Case 3")
```
### The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
1. It runs the first matched case, or the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch, as if there were a C-style `break` at the end of each case.
The above program should print "Case 1", and only "Case 1".
The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
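For intuition, the program above behaves like the following plain-Python sketch (an illustration of the intended semantics only, not Fluid code; the literals stand for the values held by `a` and `b`):
```python
a, b = 10, 0

# Cases are checked one-by-one and only the first matching branch runs,
# mirroring the implicit `break` after each case.
if a <= 10:        # switch.case(fluid.less_equal(a, 10))
    print("Case 1")
elif a > 0:        # switch.case(fluid.larger(a, 0))
    print("Case 2")
else:              # switch.default()
    print("Case 3")
```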
......@@ -92,11 +92,11 @@ paddle.init(
Parameter description
- use_gpu: **optional, default False**, whether to enable GPU training
- trainer_count: **required, default 1**, total number of trainers in the current training job
- trainer_count: **required, default 1**, number of threads in the current trainer
- port: **required, default 7164**, port used to connect to the pserver
- ports_num: **required, default 1**, number of ports used to connect to the pserver
- ports_num_for_sparse: **required, default 0**, number of ports used for sparse parameter communication with the pserver
- num_gradient_servers: **required, default 1**, total number of pservers in the current training job
- num_gradient_servers: **required, default 1**, total number of trainers in the current training job
- trainer_id: **required, default 0**, unique ID of each trainer, an integer starting from 0
- pservers: **required, default 127.0.0.1**, list of IPs of the pservers started for the current training job, separated by ","
......
......@@ -95,11 +95,11 @@ paddle.init(
Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job.
- trainer_count: **required, default 1**, number of threads in the current trainer.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
- num_gradient_servers: **required, default 1**, number of trainers in the current job.
- trainer_id: **required, default 0**, ID of each trainer, starting from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
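For reference, here is a minimal sketch of a `paddle.init` call for a one-trainer job using the parameters above (assuming the v2 API import; all values are illustrative only):
```python
import paddle.v2 as paddle

paddle.init(
    use_gpu=False,
    trainer_count=1,            # threads in this trainer
    port=7164,
    ports_num=1,
    ports_num_for_sparse=1,
    num_gradient_servers=1,     # total trainers in the job
    trainer_id=0,
    pservers="127.0.0.1")
```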
......
......@@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
{
// MakeChannel should return a buffered channel if buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel if buffer_size == 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
......@@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
t.join();
delete ch;
}
TEST(Channel, SimpleUnbufferedChannelTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
std::thread t([&]() {
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 5; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because there are no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
ch->Receive(&data);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks
// any senders waiting for receivers
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to write and are blocked because there are no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data = 10;
ch->Send(&data);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
// Send should block after three iterations
// since we only have three receivers.
std::thread t([&]() {
// Try to send more times
// than there are receivers
for (int i = 0; i < 4; i++) {
ch->Send(&i);
sum_send += i;
}
});
for (int i = 0; i < 3; i++) {
int recv;
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
t.join();
delete ch;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
......@@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> {
virtual ~UnBuffered();
private:
UnBuffered() {}
std::mutex mu_ch_;
// Mutexes for readers and writers who are waiting for another reader
// or writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_;
T* item{nullptr};
std::atomic<bool> closed_{false};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
// This function implements how data is sent
// from a writer to a reader.
template <typename T>
void UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
}
writer_found_ = false;
}
// This function implements how data sent by a writer
// is received by a reader.
template <typename T>
void UnBuffered<T>::Send(T* channel_element) {}
void UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
}
cv_channel_.notify_one();
}
reader_found_ = false;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Receive(T*) {}
void UnBuffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once an UnBuffered channel
// object is destroyed.
template <typename T>
void UnBuffered<T>::Close() {}
UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
UnBuffered<T>::~UnBuffered() {}
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details
} // namespace framework
......
......@@ -34,18 +34,6 @@ namespace framework {
template <typename T>
class Vector : public std::vector<T> {
public:
/* NOTE(dzhwinter):
* Data always store and modified on Host.
* If the data is modified when use cuda_data interface,
* You need to call the CopyFromCUDA explicitly to synchronize data.
*
*/
enum class kDataPosition {
kDataOnHost = 0,
kDataOnDevice = 1,
};
public:
using std::vector<T>::vector;
......@@ -55,11 +43,12 @@ class Vector : public std::vector<T> {
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
#endif
}
/* Get device vector */
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
......@@ -67,81 +56,73 @@ class Vector : public std::vector<T> {
return static_cast<T *>(cuda_ptr_);
}
/* Get host vector */
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
/* Synchronize host vector to device vector */
void CopyToCUDA();
/* Synchronize device vector to host vector */
void CopyFromCUDA();
/* Switch device vector location */
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0;
/*The DataPosition is unused now,
if we want support random access from cpu and cuda,
we need to overload all the vector method */
kDataPosition position_ = kDataPosition::kDataOnHost;
size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
if (cuda_size_ < this->size()) {
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
}
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), cuda_ctx->stream());
cuda_ctx->Wait();
cuda_size_ = this->size();
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommited cuda data.";
LOG(WARNING) << "No uncommitted cuda data.";
return;
}
this->resize(cuda_size_);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
ctx->stream());
ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
if (platform::is_cpu_place(peer_place)) {
return;
}
#ifdef PADDLE_WITH_CUDA
auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
static_cast<void *>(peer_cuda_ptr_), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr_;
cuda_ptr_ = peer_cuda_ptr;
#endif
}
......
......@@ -178,19 +178,22 @@ public:
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
real* colData = NULL;
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
size_t colWidth = outputHeight * outputWidth;
// Max col matrix height 256, Max col matrix width 1024
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
// Max col matrix width 4096, Max col matrix size 4M.
size_t outputHeightSteps =
std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
size_t maxColWidth = outputHeightSteps * outputWidth;
size_t channelSteps =
std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
(size_t)1),
inputChannels / groups_);
size_t maxColHeight = channelSteps * filterHeight * filterWidth;
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
......@@ -199,7 +202,7 @@ public:
outputHeight,
outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
......@@ -209,20 +212,24 @@ public:
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth;
int kStride = colHeight;
int nStride = outputHeight * outputWidth;
int kStride = inputChannels / groups_ * filterHeight * filterWidth;
for (size_t i = 0; i < batchSize; i++) {
filterData = inputs[1].data<real>();
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight;
colHeightStart += stepColHeight) {
for (size_t colWidthStart = 0; colWidthStart < colWidth;
colWidthStart += stepColWidth) {
int N = std::min(colWidth - colWidthStart, stepColWidth);
int K = std::min(colHeight - colHeightStart, stepColHeight);
for (size_t ic = 0; ic < inputChannels / groups_;
ic += channelSteps) {
int channels = std::min(inputChannels / groups_ - ic, channelSteps);
for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
int height = std::min(outputHeight - oh, outputHeightSteps);
int M = outputChannels / groups_;
int N = height * outputWidth;
int K = channels * filterHeight * filterWidth;
// im2col
im2col(inputData + g * inputOffset,
im2col(inputData,
imShape,
colData,
colShape,
......@@ -232,13 +239,12 @@ public:
paddingW(),
dilationH(),
dilationW(),
colHeightStart,
K,
colWidthStart,
channels,
oh,
height,
N);
// gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
......@@ -246,12 +252,12 @@ public:
N,
K,
1.0f,
filterData + g * filterOffset + colHeightStart,
filterData + ic * filterHeight * filterWidth,
kStride,
colData,
N,
beta_,
outputData + g * outputOffset + colWidthStart,
outputData + oh * outputWidth,
nStride);
}
beta_ = 1.0;
......@@ -266,17 +272,18 @@ public:
N,
K,
1.0f,
filterData + g * filterOffset,
filterData,
K,
inputData + g * inputOffset,
inputData,
N,
beta,
outputData + g * outputOffset,
outputData,
N);
}
inputData += inputOffset;
outputData += outputOffset;
filterData += filterOffset;
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
......
......@@ -111,39 +111,42 @@ public:
int paddingWidth,
int dilationHeight,
int dilationWidth,
int colHeightStart,
int colHeightSize,
int colWidthStart,
int colWidthSize) {
int inputChannels,
int colOffset,
int colOutputHeight,
int colWidth) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) {
int wOffset = (colHeightStart + colh) % filterWidth;
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
for (int colw = 0; colw < colWidthSize; colw++) {
int h = (colWidthStart + colw) / outputWidth;
int w = (colWidthStart + colw) % outputWidth;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[colh * colWidthSize + colw] =
imData[imRowIdx * inputWidth + imColIdx];
for (int ic = 0; ic < inputChannels; ic++) {
for (int oh = 0; oh < colOutputHeight; oh++) {
T* dstData = colData + oh * outputWidth;
for (int fh = 0; fh < filterHeight; fh++) {
for (int fw = 0; fw < filterWidth; fw++) {
int imRowIdx = (oh + colOffset) * strideHeight +
fh * dilationHeight - paddingHeight;
if (imRowIdx < 0 || imRowIdx >= inputHeight) {
memset(dstData, 0, outputWidth * sizeof(T));
} else {
for (int ow = 0; ow < outputWidth; ow++) {
int imColIdx =
ow * strideWidth + fw * dilationWidth - paddingWidth;
if (imColIdx < 0 || imColIdx >= inputWidth) {
dstData[ow] = T(0);
} else {
dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
}
}
}
dstData += colWidth;
}
}
}
colData += filterHeight * filterWidth * colWidth;
imData += inputHeight * inputWidth;
}
}
};
......
......@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding,
dilation,
dilation,
channels,
0,
height,
0,
width);
outputHeight,
outputHeight * outputWidth);
autotest::TensorCheckEqual(*output1, *output2);
}
......
......@@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
PROPERTIES DEPENDS test_recognize_digits)
......@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, maskMatP->getWidth());
}
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
outData[i * outStride + j] = -(real)FLT_MAX;
}
}
/* pool max one by one */
for (size_t n = 0; n < num; ++n) { // frame by frame
if (!isContiguous()) {
......@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for (size_t c = 0; c < channels; ++c) { // channel by channel
for (size_t ph = 0; ph < outputH; ++ph) {
int hstart = ph * strideH - paddingH;
int hend = std::min(hstart + sizeY, imgSizeH);
hstart = std::max(hstart, 0);
int hend = hstart + sizeY;
hstart = hstart < 0 ? 0 : hstart;
hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
for (size_t pw = 0; pw < outputW; ++pw) {
int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW);
wstart = std::max(wstart, 0);
int wend = wstart + sizeX;
wstart = wstart < 0 ? 0 : wstart;
wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
if (maskData == NULL) {
real tmp = -(real)FLT_MAX;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max(
outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
tmp = tmp < inputData[h * imgSizeW + w]
? inputData[h * imgSizeW + w]
: tmp;
}
}
outData[ph * outputW + pw] = tmp;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
......
......@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif()
op_library(cond_op DEPS framework_proto tensor net_op)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/executor.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
void Stop() override {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown();
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
}
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with lazy mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get gradients from multiple trainers; we don't care about the order in
// which they arrive, just add suffix 0~n and merge the gradients.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
rpc_service_->ShutDown();
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
};
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddComment(R"DOC(
ListenAndServ operator
This operator will start a RPC server which can receive variables
from send_op and send back variables to recv_op.
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(kOptimizeBlock,
"BlockID to run on server side.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
ops::ListenAndServOpMaker);
......@@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <ostream>
#include <thread>
#include <unistd.h>
#include "paddle/framework/executor.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h"
#include <future>
#include "paddle/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
var->GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW(
"VariableMessage type %d is not in "
"[LoDTensor, SelectedRows]",
var_type);
}
}
class RecvOp : public framework::OperatorBase {
public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
void Stop() override {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_service_->ShutDown();
server_thread_->join();
}
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope,
const platform::Place& place) const override {
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i];
client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
PADDLE_ENFORCE(client_.Wait());
}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with laze mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
framework::Executor executor(dev_place);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0);
size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit";
exit_flag = true;
break;
} else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name;
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param:" << grad_var_name;
}
VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name;
if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.FindVar(grad_var_name);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << grad_var_name;
PADDLE_THROW("Can not find server side var");
}
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) {
break;
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
private:
mutable detail::RPCClient client_;
};
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC(
Recv operator
This operator will recieve tensor from send_op
This operator can get variables from server side.
)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<framework::BlockDesc *>(
kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which parameters to optimize.")
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({});
AddAttr<int>("Fanin", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
}
};
......
......@@ -62,11 +62,13 @@ class SendOp : public framework::OperatorBase {
}
PADDLE_ENFORCE(rpc_client->Wait());
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
......@@ -85,6 +87,8 @@ Send operator
This operator will send tensor to recv_op at the parameter server.
)DOC");
// TODO(typhoonzero): remove this attr and generate a de-duplicated vector from
// epmap when initializing.
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/string/printf.h"
USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv);
USE_NO_KERNEL_OP(listen_and_serv);
USE_OP(sum);
namespace f = paddle::framework;
......@@ -33,7 +33,7 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math;
// global for simplicity.
std::unique_ptr<f::OperatorBase> recv_op;
std::unique_ptr<f::OperatorBase> listen_and_serv_op;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
......@@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) {
InitTensorsInScope(scope, place);
}
// sub program run in recv_op, for simple test we use sum
// sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensors, must be of same shape.
......@@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", block});
recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
recv_op->Run(scope, place);
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
listen_and_serv_op->Run(scope, place);
}
TEST(SendRecvOp, CPUDense) {
......@@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]);
}
recv_op->Stop();
listen_and_serv_op->Stop();
server_thread.join();
recv_op.reset(nullptr);
listen_and_serv_op.reset(nullptr);
}
TEST(SendRecvOp, CPUSparse) {
......@@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) {
EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
actual->mutable_data<float>(place)[i]);
}
recv_op->Stop();
listen_and_serv_op->Stop();
server_thread.join();
recv_op.reset();
listen_and_serv_op.reset();
}
......@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
auto step_scopes =
scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must in CPU memory.");
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
......@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program();
......@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name);
}
dev_ctx.Wait();
const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
}
}
};
......
......@@ -32,7 +32,7 @@ function cmake_gen() {
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
${PYTHON_FLAGS}
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
......@@ -55,7 +55,7 @@ EOF
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
${PYTHON_FLAGS} \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \
......
......@@ -27,9 +27,10 @@ int main(int argc, char** argv) {
}
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
"warpctc_dir"));
#else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
#endif
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
......
......@@ -76,7 +76,9 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
]
if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use']
core.init_gflags([sys.argv[0]] +
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
'''
Generate a debug graph for block.
Args:
block(Block): a block.
'''
graph = GraphPreviewGenerator("some graph")
# collect parameters and args
protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr))
def need_highlight(name):
if highlights is None: return False
for pattern in highlights:
assert type(pattern) is str
if re.match(pattern, name):
return True
return False
# draw parameters and args
vars = {}
for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims]
if not shape:
shape = ['null']
# create var
if var.persistable:
varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name))
else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn
def add_op_link_var(op, var, op2var=False):
for arg in var.arguments:
if arg not in vars:
# add missing variables as argument
vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
varn = vars[arg]
highlight = need_highlight(op.description) or need_highlight(
varn.description)
if op2var:
graph.add_edge(op, varn, highlight=highlight)
else:
graph.add_edge(varn, op, highlight=highlight)
for op in desc.ops:
opn = graph.add_op(op.type, highlight=need_highlight(op.type))
for var in op.inputs:
add_op_link_var(opn, var, False)
for var in op.outputs:
add_op_link_var(opn, var, True)
graph(path, show=True)
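A rough sketch of how the new `draw_block_graphviz` helper might be called; the import path and the program construction below are assumptions for illustration, not part of this diff:
```python
import paddle.v2.fluid as fluid
from paddle.v2.fluid.debuger import draw_block_graphviz  # hypothetical import path

program = fluid.Program()
# ... build layers/ops into program.global_block() ...

# Render the block to ./debug.dot (compiled to a PDF via `dot`), highlighting
# vars/ops whose names match the given regex patterns.
draw_block_graphviz(
    program.global_block(),
    highlights=["fc_.*"],
    path="./debug.dot")
```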
......@@ -478,9 +478,9 @@ class DistributeTranspiler:
else:
self._append_pserver_non_opt_ops(optimize_sub_program,
pserver_program, opt_op)
# Append the recv op
# Append the listen_and_serv op
pserver_program.global_block().append_op(
type="recv",
type="listen_and_serv",
inputs={},
outputs={},
attrs={
......
......@@ -451,9 +451,8 @@ class Operator(object):
if not given == need:
raise ValueError(("Incorrect setting for output(s) of "
"operator \"%s\". Need: [%s] Given: [%s]") %
(type, ", ".join(str(e)
for e in need), ", ".join(
str(e) for e in given)))
(type, ", ".join(str(e) for e in need),
", ".join(str(e) for e in given)))
for out_proto in proto.outputs:
out_args = outputs[out_proto.name]
......@@ -489,7 +488,8 @@ class Operator(object):
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'parallel_do'
'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine'
}
if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
import logging
def crepr(v):
if type(v) is str or type(v) is unicode:
return '"%s"' % v
return str(v)
class Rank(object):
def __init__(self, kind, name, priority):
'''
kind: str
name: str
priority: int
'''
self.kind = kind
self.name = name
self.priority = priority
self.nodes = []
def __str__(self):
if not self.nodes:
return ''
return '{' + 'rank={};'.format(self.kind) + \
','.join([node.name for node in self.nodes]) + '}'
class Graph(object):
rank_counter = 0
def __init__(self, title, **attrs):
self.title = title
self.attrs = attrs
self.nodes = []
self.edges = []
self.rank_groups = {}
def code(self):
return self.__str__()
def rank_group(self, kind, priority):
name = "rankgroup-%d" % Graph.rank_counter
Graph.rank_counter += 1
rank = Rank(kind, name, priority)
self.rank_groups[name] = rank
return name
def node(self, label, prefix, description="", **attrs):
node = Node(label, prefix, description, **attrs)
if 'rank' in attrs:
rank = self.rank_groups[attrs['rank']]
del attrs['rank']
rank.nodes.append(node)
self.nodes.append(node)
return node
def edge(self, source, target, **attrs):
edge = Edge(source, target, **attrs)
self.edges.append(edge)
return edge
def compile(self, dot_path):
file = open(dot_path, 'w')
file.write(self.__str__())
image_path = os.path.join(
os.path.dirname(__file__), dot_path[:-3] + "pdf")
cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
logging.warning("write block debug graph to {}".format(image_path))
return image_path
def show(self, dot_path):
image = self.compile(dot_path)
cmd = ["open", image]
subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
def _rank_repr(self):
ranks = sorted(
self.rank_groups.items(),
cmp=lambda a, b: a[1].priority > b[1].priority)
repr = []
for x in ranks:
repr.append(str(x[1]))
return '\n'.join(repr) + '\n'
def __str__(self):
reprs = [
'digraph G {',
'title = {}'.format(crepr(self.title)),
]
for attr in self.attrs:
reprs.append("{key}={value};".format(
key=attr, value=crepr(self.attrs[attr])))
reprs.append(self._rank_repr())
random.shuffle(self.nodes)
reprs += [str(node) for node in self.nodes]
for x in self.edges:
reprs.append(str(x))
reprs.append('}')
return '\n'.join(reprs)
class Node(object):
counter = 1
def __init__(self, label, prefix, description="", **attrs):
self.label = label
self.name = "%s_%d" % (prefix, Node.counter)
self.description = description
self.attrs = attrs
Node.counter += 1
def __str__(self):
reprs = '{name} [label={label} {extra} ];'.format(
name=self.name,
label=self.label,
extra=',' + ','.join("%s=%s" % (key, crepr(value))
for key, value in self.attrs.items())
if self.attrs else "")
return reprs
class Edge(object):
def __init__(self, source, target, **attrs):
'''
Link source to target.
:param source: Node
:param target: Node
:param graph: Graph
:param attrs: dic
'''
self.source = source
self.target = target
self.attrs = attrs
def __str__(self):
repr = "{source} -> {target} {extra}".format(
source=self.source.name,
target=self.target.name,
extra="" if not self.attrs else
"[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
for attr in self.attrs.items()) + "]")
return repr
class GraphPreviewGenerator(object):
'''
Generate a graph image for ONNX proto.
'''
def __init__(self, title):
# init graphviz graph
self.graph = Graph(
title,
layout="dot",
concentrate="true",
rankdir="TB", )
self.op_rank = self.graph.rank_group('same', 2)
self.param_rank = self.graph.rank_group('same', 1)
self.arg_rank = self.graph.rank_group('same', 0)
def __call__(self, path='temp.dot', show=False):
if not show:
self.graph.compile(path)
else:
self.graph.show(path)
def add_param(self, name, data_type, shape, highlight=False):
label = '\n'.join([
'<<table cellpadding="5">',
' <tr>',
' <td bgcolor="#2b787e">',
' <b>',
name,
' </b>',
' </td>',
' </tr>',
' <tr>',
' <td>',
str(data_type),
' </td>'
' </tr>',
' <tr>',
' <td>',
'[%s]' % 'x'.join(shape),
' </td>'
' </tr>',
'</table>>',
])
return self.graph.node(
label,
prefix="param",
description=name,
shape="none",
style="rounded,filled,bold",
width="1.3",
color="#148b97" if not highlight else "orange",
fontcolor="#ffffff",
fontname="Arial")
def add_op(self, opType, **kwargs):
highlight = False
if 'highlight' in kwargs:
highlight = kwargs['highlight']
del kwargs['highlight']
return self.graph.node(
"<<B>%s</B>>" % opType,
prefix="op",
description=opType,
shape="box",
style="rounded, filled, bold",
color="#303A3A" if not highlight else "orange",
fontname="Arial",
fontcolor="#ffffff",
width="1.3",
height="0.84", )
def add_arg(self, name, highlight=False):
return self.graph.node(
crepr(name),
prefix="arg",
description=name,
shape="box",
style="rounded,filled,bold",
fontname="Arial",
fontcolor="#999999",
color="#dddddd" if not highlight else "orange")
def add_edge(self, source, target, **kwargs):
highlight = False
if 'highlight' in kwargs:
highlight = kwargs['highlight']
del kwargs['highlight']
return self.graph.edge(
source,
target,
color="#00000" if not highlight else "orange",
**kwargs)
......@@ -46,6 +46,9 @@ def is_parameter(var):
def is_persistable(var):
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
return False
return var.persistable
......@@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var):
persistable=True)
def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
def save_vars(executor,
dirname,
main_program=None,
vars=None,
predicate=None,
save_file_name=None):
"""
Save variables to directory by executor.
......@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default default_main_program.
:param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be saved.
:param vars: variables need to be saved. If specify vars, program & predicate
as a bool. If it returns true, the corresponding input variable will be saved.
:param vars: variables need to be saved. If vars is specified, program & predicate
will be ignored
:param save_file_name: The name of a single file that all vars are saved to.
If it is None, save variables to separate files.
:return: None
"""
if vars is None:
......@@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
save_vars(
executor,
dirname=dirname,
vars=filter(predicate, main_program.list_vars()))
vars=filter(predicate, main_program.list_vars()),
save_file_name=save_file_name)
else:
save_program = Program()
save_block = save_program.global_block()
save_var_map = {}
for each_var in vars:
new_var = _clone_var_in_block_(save_block, each_var)
if save_file_name is None:
save_block.append_op(
type='save',
inputs={'X': [new_var]},
outputs={},
attrs={'file_path': os.path.join(dirname, new_var.name)})
else:
save_var_map[new_var.name] = new_var
if save_file_name is not None:
save_var_list = []
for name in sorted(save_var_map.keys()):
save_var_list.append(save_var_map[name])
save_block.append_op(
type='save',
inputs={'X': [new_var]},
type='save_combine',
inputs={'X': save_var_list},
outputs={},
attrs={'file_path': os.path.join(dirname, new_var.name)})
attrs={'file_path': os.path.join(dirname, save_file_name)})
executor.run(save_program)
def save_params(executor, dirname, main_program=None):
def save_params(executor, dirname, main_program=None, save_file_name=None):
"""
Save all parameters to directory with executor.
"""
......@@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None):
dirname=dirname,
main_program=main_program,
vars=None,
predicate=is_parameter)
predicate=is_parameter,
save_file_name=save_file_name)
def save_persistables(executor, dirname, main_program=None):
def save_persistables(executor, dirname, main_program=None,
save_file_name=None):
"""
Save all persistables to directory with executor.
"""
......@@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None):
dirname=dirname,
main_program=main_program,
vars=None,
predicate=is_persistable)
predicate=is_persistable,
save_file_name=save_file_name)
def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
def load_vars(executor,
dirname,
main_program=None,
vars=None,
predicate=None,
load_file_name=None):
"""
Load variables from directory by executor.
:param executor: executor that save variable
:param executor: executor that load variable
:param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be loaded.
:param vars: variables need to be loaded. If specify vars, program &
as a bool. If it returns true, the corresponding input variable will be loaded.
:param vars: variables need to be loaded. If vars is specified, program &
predicate will be ignored
:param load_file_name: The name of the single file that all vars are loaded from.
If it is None, load variables from separate files.
:return: None
"""
if vars is None:
......@@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
load_vars(
executor,
dirname=dirname,
vars=filter(predicate, main_program.list_vars()))
vars=filter(predicate, main_program.list_vars()),
load_file_name=load_file_name)
else:
load_prog = Program()
load_block = load_prog.global_block()
load_var_map = {}
for each_var in vars:
assert isinstance(each_var, Variable)
new_var = _clone_var_in_block_(load_block, each_var)
if load_file_name is None:
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [new_var]},
attrs={'file_path': os.path.join(dirname, new_var.name)})
else:
load_var_map[new_var.name] = new_var
if load_file_name is not None:
load_var_list = []
for name in sorted(load_var_map.keys()):
load_var_list.append(load_var_map[name])
load_block.append_op(
type='load',
type='load_combine',
inputs={},
outputs={"Out": [new_var]},
attrs={'file_path': os.path.join(dirname, new_var.name)})
outputs={"Out": load_var_list},
attrs={'file_path': os.path.join(dirname, load_file_name)})
executor.run(load_prog)
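Loading mirrors saving: with `load_file_name` set, a single `load_combine` op restores every variable from the combined file. Both sides sort the variables by name, so a file written by `save_combine` round-trips. Continuing the sketch above:

```python
# Restore from separate files (load_file_name=None) ...
fluid.io.load_params(exe, dirname="./model_dir")
# ... or from the single file written by save_combine.
fluid.io.load_params(exe, dirname="./model_dir", load_file_name="params.bin")
```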
def load_params(executor, dirname, main_program=None):
def load_params(executor, dirname, main_program=None, load_file_name=None):
"""
load all parameters from directory by executor.
"""
......@@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None):
executor,
dirname=dirname,
main_program=main_program,
predicate=is_parameter)
predicate=is_parameter,
load_file_name=load_file_name)
def load_persistables(executor, dirname, main_program=None):
def load_persistables(executor, dirname, main_program=None,
load_file_name=None):
"""
load all persistables from directory by executor.
"""
......@@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None):
executor,
dirname=dirname,
main_program=main_program,
predicate=is_persistable)
predicate=is_persistable,
load_file_name=load_file_name)
def get_inference_program(target_vars, main_program=None):
......@@ -238,7 +298,8 @@ def save_inference_model(dirname,
feeded_var_names,
target_vars,
executor,
main_program=None):
main_program=None,
save_file_name=None):
"""
Build a model especially for inference,
and save it to directory by the executor.
......@@ -249,6 +310,8 @@ def save_inference_model(dirname,
:param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model.
Default default_main_program().
:param save_file_name: The name of a single file that all parameters are saved to.
If it is None, save parameters to separate files.
:return: None
"""
......@@ -283,25 +346,7 @@ def save_inference_model(dirname,
with open(model_file_name, "wb") as f:
f.write(inference_program.desc.serialize_to_string())
save_params(executor, dirname, main_program)
def load_persistables_if_exist(executor, dirname, main_program=None):
filenames = next(os.walk(dirname))[2]
filenames = set(filenames)
def _is_presistable_and_exist_(var):
if not is_persistable(var):
return False
else:
return var.name in filenames
load_vars(
executor,
dirname,
main_program=main_program,
vars=None,
predicate=_is_presistable_and_exist_)
save_persistables(executor, dirname, inference_program, save_file_name)
def get_feed_targets_names(program):
......@@ -322,13 +367,15 @@ def get_fetch_targets_names(program):
return fetch_targets_names
def load_inference_model(dirname, executor):
def load_inference_model(dirname, executor, load_file_name=None):
"""
Load inference model from a directory
:param dirname: directory path
:param executor: executor that load inference model
:param load_file_name: The name of the single file that all parameters are loaded from.
If it is None, load parameters from separate files.
:return: [program, feed_target_names, fetch_targets]
program: program especially for inference.
feed_target_names: Names of variables that need to feed data
......@@ -342,7 +389,7 @@ def load_inference_model(dirname, executor):
program_desc_str = f.read()
program = Program.parse_from_string(program_desc_str)
load_persistables_if_exist(executor, dirname, program)
load_persistables(executor, dirname, program, load_file_name)
feed_target_names = get_feed_targets_names(program)
fetch_target_names = get_fetch_targets_names(program)
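Threading the file-name arguments through `save_inference_model`/`load_inference_model` means an inference model can now be stored as one program description plus one parameter file. A sketch, where `feed_var_names`, `target_vars`, and `exe` are assumed to come from an already-built network:

```python
fluid.io.save_inference_model(
    "./infer_model", feed_var_names, target_vars, exe,
    save_file_name="params.bin")
program, feed_names, fetch_targets = fluid.io.load_inference_model(
    "./infer_model", exe, load_file_name="params.bin")
```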
......@@ -359,6 +406,7 @@ def get_parameter_value(para, executor):
:param executor: executor for retrieving the value
:param para: the given parameter
:return: the LoDTensor for the parameter
"""
assert is_parameter(para)
......@@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None):
:param name: the name of the parameter
:param program: the program where the variable is found
Default default_main_program().
:return: the LoDTensor for the variable
"""
if program is None:
......
......@@ -108,7 +108,7 @@ class ListenAndServ(object):
"""
def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("recv")
self.helper = LayerHelper("listen_and_serv")
self.inputs = []
self.outputs = []
self.endpoint = endpoint
......@@ -158,7 +158,7 @@ class ListenAndServ(object):
param_names = [p.name for p in params]
grad_names = [g.name for g in grads]
parent_block.append_op(
type='recv',
type='listen_and_serv',
inputs={},
outputs={},
attrs={
......@@ -196,3 +196,31 @@ def Send(endpoints, send_vars, get_vars):
outputs={"Out": get_vars},
attrs={"endpoints": endpoints,
"epmap": epmap})
def Recv(endpoints, get_vars):
    """
    Recv layer

    Args:
        endpoints: comma separated IP:PORT pairs that the variables
            are fetched from
        get_vars: vars to fetch from the server after the server side
            program has finished running.

    Fetch variables from the server side once the server has finished
    running the server side program.
    """
    assert (type(get_vars) == list)
epmap = endpoints.split(",")
endpoints = list(set(epmap))
helper = LayerHelper("Recv", **locals())
helper.append_op(
type="recv",
inputs={"X": get_vars},
outputs={"Out": get_vars},
attrs={"endpoints": endpoints,
"epmap": epmap})
......@@ -59,6 +59,7 @@ __all__ = [
'elementwise_pow',
'clip',
'clip_by_norm',
'softmax',
'sequence_softmax',
] + __activations__
......
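With `softmax` now exported next to `sequence_softmax`, it can be used as an ordinary layer (this mirrors the `test_softmax` unit test added further below):

```python
import paddle.v2.fluid as fluid

data = fluid.layers.data(name='data', shape=[10], dtype='float32')
hid = fluid.layers.fc(input=data, size=20)
prob = fluid.layers.softmax(x=hid)
```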
......@@ -295,7 +295,7 @@ def fill_constant_batch_size_like(input,
return out
def ones(shape, dtype):
def ones(shape, dtype, force_cpu=False):
"""
**ones**
......@@ -319,7 +319,7 @@ def ones(shape, dtype):
return fill_constant(value=1.0, **locals())
def zeros(shape, dtype):
def zeros(shape, dtype, force_cpu=False):
"""
**zeros**
......
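`ones`/`zeros` simply forward their arguments to `fill_constant`, so the new `force_cpu` flag pins the constant to CPU memory even in a GPU program; the beam-search decoder below uses it for its loop counter. For example:

```python
# Keep the loop counter on the CPU regardless of where the rest of the program runs.
counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
```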
......@@ -31,7 +31,7 @@ dtype_to_size = {
class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num):
def __init__(self, Program, ops, forward_num, skip_opt):
self._program = Program
self._ops = ops
self._forward_num = forward_num
......@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
self._defs = defaultdict(set)
self._live_in = defaultdict(set)
self._live_out = defaultdict(set)
self._skip_opt = skip_opt
def _add_connections(self, connections):
for node1, node2 in connections:
......@@ -130,6 +131,10 @@ class ControlFlowGraph(object):
block_desc, x,
is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
return False
if x in self._skip_opt:
return False
if not self._find_var(block_desc, x, is_forward).shape():
return False
return True
self._build_graph()
......@@ -140,6 +145,7 @@ class ControlFlowGraph(object):
if op.type() == "while" or op.type() == "while_grad":
continue
block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num
if self.pool:
defs_can_optimize = filter(
......@@ -197,28 +203,32 @@ def get_cfgs(input_program):
block_desc = pdesc.block(0)
op_size = block_desc.op_size()
# Get global block ops
ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set()))
while_sub_block_ids = []
while_grad_sub_block_ids = []
while_pair = []
while_op_output = set()
while_block_id_pair = []
for i in range(op_size):
op = block_desc.op(i)
if op.type() == "while":
while_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
elif op.type() == "while_grad":
while_grad_sub_block_ids.append(op.attr("sub_block").id)
while_op_output.update(op.output_arg_names())
# Find while/while_grad block pair
for grad_id in while_grad_sub_block_ids:
parent_id = pdesc.block(grad_id).parent
if parent_id in while_sub_block_ids:
while_pair.append((parent_id, grad_id))
while_block_id_pair.append((parent_id, grad_id))
while_sub_block_ids.remove(parent_id)
# Get while/while_grad block ops
for parent_id, grad_id in while_pair:
for parent_id, grad_id in while_block_id_pair:
while_block_ops = []
while_block = pdesc.block(parent_id)
while_block_op_size = while_block.op_size()
......@@ -230,7 +240,7 @@ def get_cfgs(input_program):
for i in range(while_grad_block_op_size):
while_block_ops.append(while_grad_block.op(i))
ops_list.append((while_block_ops, while_block_op_size))
ops_list.append((while_block_ops, while_block_op_size, while_op_output))
# Process rest while block ops
for parent_id in while_sub_block_ids:
......@@ -242,7 +252,7 @@ def get_cfgs(input_program):
ops_list.append((while_block_ops, while_block_op_size))
cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
return cfgs
......
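The `skip_opt` set carried by each `ControlFlowGraph` now holds the output names of `while`/`while_grad` ops, and `_check_var_validity` refuses to reuse anything in that set (or anything without a static shape). A self-contained toy sketch of the rule, with hypothetical variable names, not the real class:

```python
def can_reuse(var_name, var_shape, skip_opt):
    """Mimic the new validity checks used during memory optimization."""
    if var_name in skip_opt:   # outputs of while/while_grad are never reused
        return False
    if not var_shape:          # variables without a static shape are skipped too
        return False
    return True

skip = {"while_op_out_0"}      # hypothetical name collected from a while op
print(can_reuse("while_op_out_0", (32, 32), skip))  # False
print(can_reuse("fc_0.tmp_1", (32, 32), skip))      # True
```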
......@@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op)
endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
add_subdirectory(book)
add_subdirectory(book_distribute)
......
recognize_digits_*.inference.model
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
py_test(test_recognize_digits_mlp_cpu
SRCS test_recognize_digits.py
ARGS mlp)
py_test(test_recognize_digits_mlp_cuda
SRCS test_recognize_digits.py
ARGS mlp --use_cuda)
py_test(test_recognize_digits_conv_cpu
SRCS test_recognize_digits.py
ARGS conv)
py_test(test_recognize_digits_conv_cuda
SRCS test_recognize_digits.py
ARGS conv --use_cuda)
py_test(test_recognize_digits_mlp_cpu_parallel
SRCS test_recognize_digits.py
ARGS mlp --parallel)
py_test(test_recognize_digits_mlp_cuda_parallel
SRCS test_recognize_digits.py
ARGS mlp --use_cuda --parallel)
py_test(test_recognize_digits_conv_cpu_parallel
SRCS test_recognize_digits.py
ARGS conv --parallel)
py_test(test_recognize_digits_conv_cuda_parallel
SRCS test_recognize_digits.py
ARGS conv --use_cuda --parallel)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
......
......@@ -12,44 +12,74 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import contextlib
import unittest
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
y_predict = fluid.layers.fc(input=x, size=1, act=None)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
BATCH_SIZE = 20
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
BATCH_SIZE = 20
exe.run(fluid.default_startup_program())
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
PASS_NUM = 100
for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
print(avg_loss_value)
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
exit(1)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost])
print(avg_loss_value)
if avg_loss_value[0] < 10.0:
return
raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
avg_loss_value[0]))
class TestFitALine(unittest.TestCase):
def test_cpu(self):
with self.program_scope_guard():
main(use_cuda=False)
def test_cuda(self):
with self.program_scope_guard():
main(use_cuda=True)
@contextlib.contextmanager
def program_scope_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
unittest.main()
......@@ -14,10 +14,10 @@
from __future__ import print_function
import sys
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import unittest
import contextlib
def resnet_cifar10(input, depth=32):
......@@ -89,56 +89,89 @@ def vgg16_bn_drop(input):
return fc2
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
net_type = "vgg"
if len(sys.argv) >= 2:
net_type = sys.argv[1]
if net_type == "vgg":
print("train vgg net")
net = vgg16_bn_drop(images)
elif net_type == "resnet":
print("train resnet")
net = resnet_cifar10(images, 32)
else:
raise ValueError("%s network is not supported" % net_type)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
opts = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
BATCH_SIZE = 128
PASS_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc))
# this model is slow, so if we can train two mini batch, we think it works properly.
exit(0)
exit(1)
def main(net_type, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
classdim = 10
data_shape = [3, 32, 32]
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if net_type == "vgg":
print("train vgg net")
net = vgg16_bn_drop(images)
elif net_type == "resnet":
print("train resnet")
net = resnet_cifar10(images, 32)
else:
raise ValueError("%s network is not supported" % net_type)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
BATCH_SIZE = 128
PASS_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=128 * 10),
batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())
loss = 0.0
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc))
return
raise AssertionError(
"Image classification loss is too large, {0:2.2}".format(loss))
class TestImageClassification(unittest.TestCase):
def test_vgg_cuda(self):
with self.scope_prog_guard():
main('vgg', use_cuda=True)
def test_resnet_cuda(self):
with self.scope_prog_guard():
main('resnet', use_cuda=True)
def test_vgg_cpu(self):
with self.scope_prog_guard():
main('vgg', use_cuda=False)
def test_resnet_cpu(self):
with self.scope_prog_guard():
main('resnet', use_cuda=False)
@contextlib.contextmanager
def scope_prog_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
unittest.main()
......@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as pd
from paddle.v2.fluid.executor import Executor
import unittest
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32
word_dim = 16
IS_SPARSE = True
batch_size = 2
max_length = 8
topk_size = 50
......@@ -34,10 +33,8 @@ beam_size = 2
decoder_size = hidden_dim
place = core.CPUPlace()
def encoder():
def encoder(is_sparse):
# encoder
src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
......@@ -45,7 +42,7 @@ def encoder():
input=src_word_id,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb'))
fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
......@@ -54,7 +51,7 @@ def encoder():
return encoder_out
def decoder_train(context):
def decoder_train(context, is_sparse):
# decoder
trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
......@@ -62,7 +59,7 @@ def decoder_train(context):
input=trg_language_word,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(name='vemb'))
rnn = pd.DynamicRNN()
......@@ -82,10 +79,10 @@ def decoder_train(context):
return rnn()
def decoder_decode(context):
def decoder_decode(context, is_sparse):
init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64')
counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
# fill the first element with init_state
state_array = pd.create_array('float32')
......@@ -117,7 +114,7 @@ def decoder_decode(context):
input=pre_ids,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE)
is_sparse=is_sparse)
# use rnn unit to update rnn
current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
......@@ -150,7 +147,7 @@ def decoder_decode(context):
def set_init_lod(data, lod, place):
res = core.LoDTensor()
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod(lod)
return res
......@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def train_main():
context = encoder()
rnn_out = decoder_train(context)
def train_main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
rnn_out = decoder_train(context, is_sparse)
label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label)
......@@ -212,9 +213,13 @@ def train_main():
batch_id += 1
def decode_main():
context = encoder()
translation_ids, translation_scores = decoder_decode(context)
def decode_main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse)
translation_ids, translation_scores = decoder_decode(context, is_sparse)
exe = Executor(place)
exe.run(framework.default_startup_program())
......@@ -250,6 +255,60 @@ def decode_main():
break
class TestMachineTranslation(unittest.TestCase):
pass
@contextlib.contextmanager
def scope_prog_guard():
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
def inject_test_train(use_cuda, is_sparse):
f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
train_main(use_cuda, is_sparse)
setattr(TestMachineTranslation, f_name, f)
def inject_test_decode(use_cuda, is_sparse, decorator=None):
f_name = 'test_{0}_{1}_decode'.format('cuda'
if use_cuda else 'cpu', 'sparse'
if is_sparse else 'dense')
def f(*args):
with scope_prog_guard():
decode_main(use_cuda, is_sparse)
if decorator is not None:
f = decorator(f)
setattr(TestMachineTranslation, f_name, f)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
inject_test_train(_use_cuda_, _is_sparse_)
for _use_cuda_ in (False, True):
for _is_sparse_ in (False, True):
_decorator_ = None
if _use_cuda_:
_decorator_ = unittest.skip(
reason='Beam Search does not support CUDA!')
inject_test_decode(
is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
if __name__ == '__main__':
# train_main()
decode_main()
unittest.main()
......@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import sys
import numpy
import unittest
def parse_arg():
......@@ -74,18 +75,18 @@ def conv_net(img, label):
return loss_net(conv_pool_2, label)
def train(args, save_dirname=None):
print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
def train(nn_type, use_cuda, parallel, save_dirname):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if args.nn_type == 'mlp':
if nn_type == 'mlp':
net_conf = mlp
else:
net_conf = conv_net
if args.parallel:
if parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
......@@ -107,7 +108,7 @@ def train(args, save_dirname=None):
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -147,13 +148,14 @@ def train(args, save_dirname=None):
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val)))
raise AssertionError("Loss of recognize digits is too large")
def infer(args, save_dirname=None):
def infer(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Use fluid.io.load_inference_model to obtain the inference program desc,
......@@ -174,11 +176,48 @@ def infer(args, save_dirname=None):
print("infer results: ", results[0])
if __name__ == '__main__':
args = parse_arg()
if not args.use_cuda and not args.parallel:
save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
def main(use_cuda, parallel, nn_type):
if not use_cuda and not parallel:
save_dirname = "recognize_digits_" + nn_type + ".inference.model"
else:
save_dirname = None
train(args, save_dirname)
infer(args, save_dirname)
train(
nn_type=nn_type,
use_cuda=use_cuda,
parallel=parallel,
save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
class TestRecognizeDigits(unittest.TestCase):
pass
def inject_test_method(use_cuda, parallel, nn_type):
def __impl__(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
main(use_cuda, parallel, nn_type)
fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
if use_cuda else 'cpu', 'parallel'
if parallel else 'normal')
setattr(TestRecognizeDigits, fn, __impl__)
def inject_all_tests():
for use_cuda in (False, True):
for parallel in (False, True):
for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type)
inject_all_tests()
if __name__ == '__main__':
unittest.main()
......@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.sequence_softmax(x=seq))
print(str(program))
def test_softmax(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[10], dtype='float32')
hid = layers.fc(input=data, size=20)
self.assertIsNotNone(layers.softmax(x=hid))
print(str(program))
def test_get_places(self):
program = Program()
with program_guard(program):
......
......@@ -19,6 +19,7 @@ import paddle.v2.fluid.layers as layers
import numpy
from multiprocessing import Process
import os, sys
import time
class TestRecvOp(unittest.TestCase):
......@@ -28,6 +29,7 @@ class TestRecvOp(unittest.TestCase):
p = Process(target=self.init_serv, args=(place, ))
p.daemon = True
p.start()
time.sleep(1)
self.init_client(place)
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
os.system("kill -9 %d" % p.pid)
......
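The `time.sleep(1)` added above gives the forked `listen_and_serv` process time to bind its port before the client connects. A more robust, purely illustrative alternative would poll the port instead of sleeping for a fixed interval:

```python
import contextlib
import socket
import time

def wait_for_port(host, port, timeout=10.0):
    """Return True as soon as a TCP connection to host:port succeeds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        with contextlib.closing(socket.socket()) as s:
            if s.connect_ex((host, port)) == 0:
                return True
        time.sleep(0.1)
    return False
```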