提交 e5227c2c 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_make_VarDesc_supporting_multiple_tensor
...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas ...@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost include(external/boost) # download boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
......
...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO ...@@ -21,6 +21,7 @@ set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOO
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR}) include_directories(${BOOST_INCLUDE_DIR})
......
...@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn ...@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
${SPHINX_CACHE_DIR_CN} ${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN}) ${SPHINX_HTML_DIR_CN})
add_subdirectory(api)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)
sphinx_add_target(paddle_api_docs
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
...@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue. In Go, its implemented i ...@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue. In Go, its implemented i
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax. It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
### Type Channel ### Type Channel
...@@ -71,14 +71,14 @@ ch1 := make(chan int, 100) // a channel that can buffer 100 ints. ...@@ -71,14 +71,14 @@ ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
In Fluid, we should be able to do the same: In Fluid, we should be able to do the same:
```python ```python
ch = fluid.make_chan(dtype=INT) ch = fluid.make_channel(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, 100) ch1 = fluid.make_channel(dtype=INT, 100)
``` ```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python ```python
ch = fluid.make_chan(dtype=Tensor, etype=float16) ch = fluid.make_channel(dtype=Tensor, etype=float16)
``` ```
or Tensors of Tensors of float16 etc. or Tensors of Tensors of float16 etc.
...@@ -87,8 +87,135 @@ The point here is that we need a consistent way to compose types, like in C++ we ...@@ -87,8 +87,135 @@ The point here is that we need a consistent way to compose types, like in C++ we
### Send and Recv ### Send and Recv
Go's CSP implementation depends on data type *channel*. There are two types of channels:
1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
1. blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block with unbuffered channels.
There are four types of actions with a channel:
1. Create a channel
```go
ch := make(chan int) // this is an unbuffered channel
ch := make(chan int, 100) // this is a buffered channel of 100 ints.
```
1. Send
```go
ch <- 111
```
1. Recv
```go
y, ok <- ch
```
1. Close
```go
close(ch)
```
Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
1. A send to a nil channel blocks forever
1. A receive from a nil channel blocks forever
1. A send to a closed channel panics
1. A receive from a closed channel returns the residual values and then zeros.
In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
The following program illustrates the Python syntax for accessing Fluid buffers.
```python
import fluid
buffer_size = 10
ch = fluid.make_channel(dtype=INT, buffer_size)
# Now write three elements to the channel
with fluid.while(steps=buffer_size):
fluid.send(ch, step)
fluid.close_channel(ch)
with fluid.while(steps=buffer_size):
fluid.print(fluid.recv(ch))
```
The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
```python
import fluid
ch = fluid.make_channel(dtype=INT)
with fluid.go():
fluid.send(ch)
y = fluid.recv(ch)
fluid.close_channel(ch)
```
### Select ### Select
In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
```go
ch1 := make(chan int)
ch2 := make(chan int, 100)
x := 0
for {
select {
case ch1 <- x:
x := x + 1
case y <- ch2:
fmt.Println("Received on channel")
default:
fmt.Println("Default")
}
}
```
In Fluid, we should be able to do the same:
```python
ch1 = fluid.make_chan(dtype=INT)
ch2 = fluid.make_chan(dtype=INT, 100)
sel = fluid.select()
with sel.case(ch1, 'w', X):
fluid.layers.increment(X)
with sel.case(ch2, 'r', Y):
fluid.print("Received on Channel")
with sel.default():
fluid.print("Default")
```
In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
## Example Programs ## Example Programs
### 1. RPC between Trainers and Parameter Servers ### 1. RPC between Trainers and Parameter Servers
......
...@@ -8,4 +8,3 @@ PaddlePaddle 文档 ...@@ -8,4 +8,3 @@ PaddlePaddle 文档
howto/index_cn.rst howto/index_cn.rst
api/index_cn.rst api/index_cn.rst
faq/index_cn.rst faq/index_cn.rst
mobile/index_cn.rst
...@@ -7,4 +7,3 @@ PaddlePaddle Documentation ...@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst getstarted/index_en.rst
howto/index_en.rst howto/index_en.rst
api/index_en.rst api/index_en.rst
mobile/index_en.rst
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_ios_en.md
cross_compiling_for_raspberry_en.md
...@@ -23,8 +23,8 @@ namespace framework { ...@@ -23,8 +23,8 @@ namespace framework {
template <typename T> template <typename T>
class Channel { class Channel {
public: public:
virtual void Send(T*) = 0; virtual bool Send(T*) = 0;
virtual void Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Close() = 0; virtual void Close() = 0;
virtual ~Channel() {} virtual ~Channel() {}
......
...@@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { ...@@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10; const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size); auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i); // should not block EXPECT_EQ(ch->Send(&i), true); // should not block
} }
size_t out; size_t out;
for (size_t i = 0; i < buffer_size; ++i) { for (size_t i = 0; i < buffer_size; ++i) {
ch->Receive(&out); // should not block EXPECT_EQ(ch->Receive(&out), true); // should not block
EXPECT_EQ(out, i); EXPECT_EQ(out, i);
} }
CloseChannel(ch); CloseChannel(ch);
...@@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() { std::thread t([&]() {
// Try to write more than buffer size. // Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) { for (size_t i = 0; i < 2 * buffer_size; ++i) {
ch->Send(&i); // should not block if (i < buffer_size)
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i; sum += i;
} }
}); });
...@@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) { ...@@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
unsigned sum_send = 0; unsigned sum_send = 0;
std::thread t([&]() { std::thread t([&]() {
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ch->Send(&i); EXPECT_EQ(ch->Send(&i), true);
sum_send += i; sum_send += i;
} }
}); });
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
int recv; int recv;
ch->Receive(&recv); EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i); EXPECT_EQ(recv, i);
} }
...@@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) { ...@@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
delete ch; delete ch;
} }
// This tests that closing a buffered channel also unblocks
// any receivers waiting on the channel
TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing a buffered channel also unblocks
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
// Explicitly close the thread
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Verify that only 1 send was successful
ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
for (size_t i = 0; i < num_threads; i++) t[i].join();
delete ch;
}
// This tests that closing an unbuffered channel also unblocks // This tests that closing an unbuffered channel also unblocks
// unblocks any receivers waiting for senders // unblocks any receivers waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
...@@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { ...@@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
int data; int data;
ch->Receive(&data); EXPECT_EQ(ch->Receive(&data), false);
*p = true; *p = true;
}, },
&thread_ended[i]); &thread_ended[i]);
...@@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { ...@@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
int data = 10; int data = 10;
ch->Send(&data); EXPECT_EQ(ch->Send(&data), false);
*p = true; *p = true;
}, },
&thread_ended[i]); &thread_ended[i]);
...@@ -207,3 +306,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { ...@@ -207,3 +306,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
t.join(); t.join();
delete ch; delete ch;
} }
TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
auto ch = MakeChannel<int>(0);
unsigned sum_send = 0;
unsigned sum_receive = 0;
// The receiver should block after 5
// iterations, since there are only 5 senders.
std::thread t([&]() {
for (int i = 0; i < 8; i++) {
int recv;
ch->Receive(&recv); // should block after the fifth iteration.
EXPECT_EQ(recv, i);
sum_receive += i;
}
});
for (int i = 0; i < 5; i++) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
for (int i = 5; i < 8; i++) {
ch->Send(&i);
sum_send += i;
}
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 28U);
EXPECT_EQ(sum_receive, 28U);
delete ch;
}
...@@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Close(); virtual void Close();
virtual ~Buffered(); virtual ~Buffered();
...@@ -48,39 +48,43 @@ class Buffered : public paddle::framework::Channel<T> { ...@@ -48,39 +48,43 @@ class Buffered : public paddle::framework::Channel<T> {
PADDLE_ENFORCE_GT(cap, 0); PADDLE_ENFORCE_GT(cap, 0);
} }
void NotifyAllSenders(std::unique_lock<std::mutex>*); void NotifyAllParticipants(std::unique_lock<std::mutex>*);
}; };
template <typename T> template <typename T>
void Buffered<T>::Send(T* item) { bool Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock, full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; }); [this]() { return channel_.size() < cap_ || closed_; });
bool ret = false;
if (!closed_) { if (!closed_) {
channel_.push_back(std::move(*item)); channel_.push_back(std::move(*item));
lock.unlock(); lock.unlock();
empty_cond_var_.notify_one(); empty_cond_var_.notify_one();
ret = true;
} }
return ret;
} }
template <typename T> template <typename T>
void Buffered<T>::Receive(T* item) { bool Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
bool ret = false;
if (!closed_) { if (!closed_) {
*item = std::move(channel_.front()); *item = std::move(channel_.front());
channel_.pop_front(); channel_.pop_front();
NotifyAllSenders(&lock); full_cond_var_.notify_one();
} else { ret = true;
item = nullptr;
} }
return ret;
} }
template <typename T> template <typename T>
void Buffered<T>::Close() { void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
closed_ = true; closed_ = true;
NotifyAllSenders(&lock); NotifyAllParticipants(&lock);
} }
template <typename T> template <typename T>
...@@ -88,13 +92,14 @@ Buffered<T>::~Buffered() { ...@@ -88,13 +92,14 @@ Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_); std::unique_lock<std::mutex> lock(mu_);
closed_ = true; closed_ = true;
channel_.clear(); channel_.clear();
NotifyAllSenders(&lock); NotifyAllParticipants(&lock);
} }
template <typename T> template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) { void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock(); lock->unlock();
full_cond_var_.notify_all(); full_cond_var_.notify_all();
empty_cond_var_.notify_all();
} }
} // namespace details } // namespace details
......
...@@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel<T> { ...@@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T>*); friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public: public:
virtual void Send(T*); virtual bool Send(T*);
virtual void Receive(T*); virtual bool Receive(T*);
virtual size_t Cap() { return 0; } virtual size_t Cap() { return 0; }
virtual void Close(); virtual void Close();
virtual ~UnBuffered(); virtual ~UnBuffered();
...@@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel<T> { ...@@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel<T> {
// This function implements the concept of how data should // This function implements the concept of how data should
// be sent from a writer to a reader. // be sent from a writer to a reader.
template <typename T> template <typename T>
void UnBuffered<T>::Send(T* data) { bool UnBuffered<T>::Send(T* data) {
// Prevent other writers from entering // Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_); std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true; writer_found_ = true;
...@@ -66,6 +66,7 @@ void UnBuffered<T>::Send(T* data) { ...@@ -66,6 +66,7 @@ void UnBuffered<T>::Send(T* data) {
cv_writer_.wait(cv_lock, cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; }); [this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one(); cv_reader_.notify_one();
bool ret = false;
if (!closed_) { if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_); std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data; item = data;
...@@ -74,14 +75,16 @@ void UnBuffered<T>::Send(T* data) { ...@@ -74,14 +75,16 @@ void UnBuffered<T>::Send(T* data) {
channel_lock.lock(); channel_lock.lock();
cv_channel_.wait(channel_lock, cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; }); [this]() { return item == nullptr || closed_; });
ret = true;
} }
writer_found_ = false; writer_found_ = false;
return ret;
} }
// This function implements the concept of how // This function implements the concept of how
// data that was sent by a writer is read from a reader. // data that was sent by a writer is read from a reader.
template <typename T> template <typename T>
void UnBuffered<T>::Receive(T* data) { bool UnBuffered<T>::Receive(T* data) {
// Prevent other readers from entering // Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_}; std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true; reader_found_ = true;
...@@ -90,6 +93,7 @@ void UnBuffered<T>::Receive(T* data) { ...@@ -90,6 +93,7 @@ void UnBuffered<T>::Receive(T* data) {
cv_reader_.wait(cv_lock, cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; }); [this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one(); cv_writer_.notify_one();
bool ret = false;
if (!closed_) { if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_}; std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data // Reader should wait for the writer to first write its data
...@@ -98,10 +102,12 @@ void UnBuffered<T>::Receive(T* data) { ...@@ -98,10 +102,12 @@ void UnBuffered<T>::Receive(T* data) {
*data = std::move(*item); *data = std::move(*item);
item = nullptr; item = nullptr;
lock_ch.unlock(); lock_ch.unlock();
ret = true;
} }
cv_channel_.notify_one(); cv_channel_.notify_one();
} }
reader_found_ = false; reader_found_ = false;
return ret;
} }
// This function implements the sequence of events // This function implements the sequence of events
......
...@@ -58,6 +58,47 @@ void TestInference(const std::string& dirname, ...@@ -58,6 +58,47 @@ void TestInference(const std::string& dirname,
delete scope; delete scope;
} }
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
T lower,
T upper) {
srand(time(0));
float* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) {
input_ptr[i] =
(static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
lower;
}
}
template <typename T>
void CheckError(paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) {
// Check lod information
EXPECT_EQ(output1.lod(), output2.lod());
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
T err = static_cast<T>(0);
if (typeid(T) == typeid(float)) {
err = 1E-3;
} else if (typeid(T) == typeid(double)) {
err = 1E-6;
} else {
err = 0;
}
size_t count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
TEST(inference, recognize_digits) { TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) { if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
...@@ -70,12 +111,10 @@ TEST(inference, recognize_digits) { ...@@ -70,12 +111,10 @@ TEST(inference, recognize_digits) {
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
srand(time(0)); // Use normilized image pixels as input data,
float* input_ptr = // which should be in the range [-1.0, 1.0].
input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace()); SetupTensor<float>(
for (int i = 0; i < 784; ++i) { input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -98,16 +137,6 @@ TEST(inference, recognize_digits) { ...@@ -98,16 +137,6 @@ TEST(inference, recognize_digits) {
dirname, cpu_feeds, cpu_fetchs2); dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
EXPECT_EQ(output1.dims(), output2.dims()); CheckError<float>(output1, output2);
EXPECT_EQ(output1.numel(), output2.numel());
float err = 1E-3;
int count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif #endif
} }
...@@ -158,7 +158,10 @@ op_library(parallel_do_op DEPS executor) ...@@ -158,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
# Regist multiple Kernel to pybind # Regist multiple Kernel to pybind
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
vol2col depthwise_conv)
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
......
...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>); ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -16,6 +16,16 @@ limitations under the License. */ ...@@ -16,6 +16,16 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>, conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>); ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/operators/math/im2col.h" #include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h" #include "paddle/operators/math/vol2col.h"
...@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
} }
} }
}; };
template <typename DeviceContext, typename T>
class DepthwiseConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_EQ(
output->dims()[1] % input->dims()[1], 0,
"The output channels must be a multiple of the input channels");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
auto& dev_ctx = context.template device_context<DeviceContext>();
depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
}
};
// Depthwise convolution backward kernel (NCHW layout): computes the
// gradients w.r.t. the input and/or the filter, each via its own functor.
template <typename DeviceContext, typename T>
class DepthwiseConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* in = context.Input<Tensor>("Input");
    const Tensor* out_grad =
        context.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* in_grad = context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* weight_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));
    Tensor weight = *context.Input<Tensor>("Filter");

    // Neither gradient is requested: nothing to compute.
    if (!in_grad && !weight_grad) return;

    const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    const std::vector<int> paddings =
        context.Attr<std::vector<int>>("paddings");
    const std::vector<int> dilations =
        context.Attr<std::vector<int>>("dilations");  // read but unused here

    auto& dev_ctx = context.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> set_zero;
    math::DepthwiseConvInputGradFunctor<DeviceContext, T> input_grad_func;
    math::DepthwiseConvFilterGradFunctor<DeviceContext, T> filter_grad_func;

    if (in_grad) {
      in_grad->mutable_data<T>(context.GetPlace());
      // The functor accumulates into the buffer, so it must start zeroed.
      set_zero(dev_ctx, in_grad, static_cast<T>(0));
      input_grad_func(dev_ctx, *in, weight, *out_grad, strides, paddings,
                      in_grad);
    }
    if (weight_grad) {
      weight_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, weight_grad, static_cast<T>(0));
      filter_grad_func(dev_ctx, *in, *out_grad, strides, paddings,
                       weight_grad);
    }
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -8,6 +8,7 @@ if(WITH_GPU) ...@@ -8,6 +8,7 @@ if(WITH_GPU)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
......
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/depthwise_conv.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
// A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format. One thread computes one output element; `nthreads`
// is batch_size * output_channels * output_height * output_width.
template <typename T>
__global__ void KernelDepthwiseConv(
    const int nthreads, const T* const input_data, const T* const filter_data,
    const int batch_size, const int output_channels, const int output_height,
    const int output_width, const int input_channels, const int input_height,
    const int input_width, const int filter_multiplier, const int filter_height,
    const int filter_width, const int stride_height, const int stride_width,
    const int padding_height, const int padding_width, T* const output_data) {
  // Global linear thread id over a 2-D grid of 1-D blocks.
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    // Decompose the linear index into NCHW output coordinates.
    const int batch = index / output_channels / output_height / output_width;
    const int c_out = (index / output_height / output_width) % output_channels;
    const int h_out = (index / output_width) % output_height;
    const int w_out = index % output_width;
    // Each group of `filter_multiplier` output channels reads from a
    // single input channel.
    const int c_in = c_out / filter_multiplier;
    const T* weight = filter_data + c_out * filter_height * filter_width;
    T value = 0;
    // Top-left corner of the receptive field; may be negative when the
    // output position lies in the padded border.
    const int h_in_start = -padding_height + h_out * stride_height;
    const int w_in_start = -padding_width + w_out * stride_width;
    const int h_in_end = h_in_start + filter_height;
    const int w_in_end = w_in_start + filter_width;

    const int in_offset =
        ((batch * input_channels + c_in) * input_height) * input_width;

    // Clamp the receptive field to the valid input region; padded
    // positions contribute zero and are simply skipped.
    const int h_end = h_in_end < input_height ? h_in_end : input_height;
    const int w_end = w_in_end < input_width ? w_in_end : input_width;
    const int h_start = h_in_start > 0 ? h_in_start : 0;
    const int w_start = w_in_start > 0 ? w_in_start : 0;

    // Accumulate filter * input over the clamped window.
    for (int h_in = h_start; h_in < h_end; h_in++) {
      for (int w_in = w_start; w_in < w_end; w_in++) {
        const int offset = in_offset + h_in * input_width + w_in;
        value +=
            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
            input_data[offset];
      }
    }
    output_data[index] = value;
  }
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
// One thread computes the gradient of one input element; `nthreads` is
// batch_size * input_channels * input_height * input_width.
template <typename T>
__global__ void KernelDepthwiseConvInputGrad(
    const int nthreads, const T* const output_grad_data,
    const T* const filter_data, const int batch_size, const int output_channels,
    const int output_height, const int output_width, const int input_channels,
    const int input_height, const int input_width, const int filter_multiplier,
    const int filter_height, const int filter_width, const int stride_height,
    const int stride_width, const int padding_height, const int padding_width,
    T* const input_grad_data) {
  // Global linear thread id over a 2-D grid of 1-D blocks.
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    // Decompose the linear index into NCHW input coordinates.
    const int batch = index / input_channels / input_height / input_width;
    const int c_in = (index / input_height / input_width) % input_channels;
    const int h_in = (index / input_width) % input_height;
    const int w_in = index % input_width;
    // Output channels that consume this input channel.
    const int c_out_start = c_in * filter_multiplier;
    // Range of output rows whose receptive field covers row h_in
    // (inclusive on both ends), clamped to the output bounds.
    int h_out_start =
        (h_in - filter_height + padding_height + stride_height) / stride_height;
    h_out_start = 0 > h_out_start ? 0 : h_out_start;
    int h_out_end = (h_in + padding_height) / stride_height;
    h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
    // Same for output columns covering column w_in.
    int w_out_start =
        (w_in - filter_width + padding_width + stride_width) / stride_width;
    w_out_start = 0 > w_out_start ? 0 : w_out_start;
    int w_out_end = (w_in + padding_width) / stride_width;
    w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;

    // Sum the contributions of every (c_out, h_out, w_out) position whose
    // receptive field touched this input element.
    T value = 0;
    for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
         c_out++) {
      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
        // Position of this input element inside the filter window.
        const int filter_h = h_in + padding_height - h_out * stride_height;
        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
          const int filter_w = w_in + padding_width - w_out * stride_width;
          const int filter_offset = c_out * filter_height * filter_width +
                                    filter_h * filter_width + filter_w;
          const int output_grad_offset =
              ((batch * output_channels + c_out) * output_height + h_out) *
                  output_width +
              w_out;
          value +=
              output_grad_data[output_grad_offset] * filter_data[filter_offset];
        }
      }
    }
    // Accumulate: the caller zero-initializes input_grad (see
    // DepthwiseConvGradKernel's set_zero call).
    input_grad_data[index] += value;
  }
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
// One thread processes one OUTPUT element and scatters its contribution
// into the filter gradient; since many output positions map to the same
// filter weight, the writes use atomic adds.
template <typename T>
__global__ void KernelDepthwiseConvFilterGrad(
    const int nthreads, const T* const output_grad_data,
    const T* const input_data, const int num, const int output_channels,
    const int output_height, const int output_width, const int input_channels,
    const int input_height, const int input_width, const int filter_multiplier,
    const int filter_height, const int filter_width, const int stride_height,
    const int stride_width, const int padding_height, const int padding_width,
    T* const filter_grad_data) {
  // Global linear thread id over a 2-D grid of 1-D blocks.
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    // Decompose the linear index into NCHW output coordinates.
    const int w_out = index % output_width;
    const int h_out = (index / output_width) % output_height;
    const int c_out = (index / output_width / output_height) % output_channels;
    const int batch = (index / output_width / output_height / output_channels);
    // Input channel feeding this output channel.
    const int c_in = c_out / filter_multiplier;
    // Receptive field of this output element (may extend into padding).
    const int h_in_start = -padding_height + h_out * stride_height;
    const int w_in_start = -padding_width + w_out * stride_width;
    const int h_in_end =
        -padding_height + h_out * stride_height + filter_height;
    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
    const int in_offset =
        (batch * input_channels + c_in) * input_height * input_width;

    // Base of the filter-gradient slice for this output channel.
    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
    // Clamp the receptive field to the valid input region.
    const int h_end = h_in_end < input_height ? h_in_end : input_height;
    const int w_end = w_in_end < input_width ? w_in_end : input_width;
    const int h_start = h_in_start > 0 ? h_in_start : 0;
    const int w_start = w_in_start > 0 ? w_in_start : 0;

    for (int h_in = h_start; h_in < h_end; h_in++) {
      for (int w_in = w_start; w_in < w_end; w_in++) {
        const int offset = in_offset + h_in * input_width + w_in;
        const T diff_temp = output_grad_data[index] * input_data[offset];
        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
                  (w_in - w_in_start);
        // Atomic: threads for different output positions update the same
        // filter weight concurrently.
        paddle::platform::CudaAtomicAdd(addr, diff_temp);
      }
    }
  }
}
/*
 * Depthwise convolution forward on the GPU.
 * All tensors are in NCHW format. Strides and paddings each hold two
 * elements, representing height and width respectively.
 */
template <class T>
class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output) {
    // Unpack shapes (NCHW) and hyper-parameters.
    const int batch_size = input.dims()[0];
    const int in_channels = input.dims()[1];
    const int in_height = input.dims()[2];
    const int in_width = input.dims()[3];
    const int out_channels = output->dims()[1];
    const int out_height = output->dims()[2];
    const int out_width = output->dims()[3];
    const int filter_height = filter.dims()[2];
    const int filter_width = filter.dims()[3];

    const T* input_data = input.data<T>();
    const T* filter_data = filter.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

    // Launch one thread per output element.
    const int total = batch_size * out_channels * out_height * out_width;
    const int block_size = 1024;
    dim3 threads(block_size, 1);
    dim3 grid((total + block_size - 1) / block_size, 1);

    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
        total, input_data, filter_data, batch_size, out_channels, out_height,
        out_width, in_channels, in_height, in_width,
        out_channels / in_channels, filter_height, filter_width, strides[0],
        strides[1], paddings[0], paddings[1], output_data);
  }
};
// Computes the gradient of the depthwise convolution w.r.t. the input.
// All tensors are NCHW; strides/paddings hold {height, width}.
// NOTE: the kernel accumulates into input_grad (`+=`), so the caller must
// zero-initialize it first (DepthwiseConvGradKernel does via SetConstant).
template <typename T>
class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad) {
    // Unpack shapes (NCHW) and hyper-parameters.
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output_grad.dims()[1];
    const int output_height = output_grad.dims()[2];
    const int output_width = output_grad.dims()[3];
    const int ksize_height = filter.dims()[2];
    const int ksize_width = filter.dims()[3];
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];

    const T* filter_data = filter.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    // One thread per INPUT element (gather formulation: each input
    // position collects the gradients that flowed through it).
    int nthreads = batch_size * input_channels * input_height * input_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
    dim3 threads(1024, 1);
    dim3 grid(blocks, 1);
    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
        nthreads, output_grad_data, filter_data, batch_size, output_channels,
        output_height, output_width, input_channels, input_height, input_width,
        output_channels / input_channels, ksize_height, ksize_width,
        stride_height, stride_width, padding_height, padding_width,
        input_grad_data);
  }
};
// Computes the gradient of the depthwise convolution w.r.t. the filter.
// All tensors are NCHW; strides/paddings hold {height, width}.
// NOTE: the kernel scatters with atomic adds into filter_grad, so the
// caller must zero-initialize it first (DepthwiseConvGradKernel does).
template <typename T>
class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* filter_grad) {
    // Unpack shapes (NCHW); the kernel size comes from filter_grad itself.
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output_grad.dims()[1];
    const int output_height = output_grad.dims()[2];
    const int output_width = output_grad.dims()[3];
    const int ksize_height = filter_grad->dims()[2];
    const int ksize_width = filter_grad->dims()[3];
    const int stride_height = strides[0];
    const int stride_width = strides[1];
    const int padding_height = paddings[0];
    const int padding_width = paddings[1];

    const T* input_data = input.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());

    // One thread per OUTPUT element (scatter formulation: each output
    // position adds its contribution to the shared filter weights).
    int nthreads = batch_size * output_channels * output_height * output_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
    dim3 threads(1024, 1);
    dim3 grid(blocks, 1);
    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
        nthreads, output_grad_data, input_data, batch_size, output_channels,
        output_height, output_width, input_channels, input_height, input_width,
        output_channels / input_channels, ksize_height, ksize_width,
        stride_height, stride_width, padding_height, padding_width,
        filter_grad_data);
  }
};
// Explicit instantiations for the element types supported by the op.
template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
                                             float>;
template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
                                             double>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
                                              float>;
template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
                                              double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
/*
 * \brief Compute the depthwise convolution which include
 * forward process and backpropagation process
 */
// Forward pass. All tensors are in NCHW layout; strides and paddings
// each hold two elements: {height, width}.
template <typename DeviceContext, typename T>
class DepthwiseConvFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output);
};
// Backward pass w.r.t. the input: accumulates into input_grad, which the
// caller is expected to zero-initialize first (see the op grad kernel).
template <typename DeviceContext, typename T>
class DepthwiseConvInputGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* input_grad);
};
// Backward pass w.r.t. the filter: accumulates into filter_grad, which the
// caller is expected to zero-initialize first (see the op grad kernel).
template <typename DeviceContext, typename T>
class DepthwiseConvFilterGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* filter_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -85,7 +85,7 @@ static inline void GetMaxScoreIndex( ...@@ -85,7 +85,7 @@ static inline void GetMaxScoreIndex(
std::stable_sort(sorted_indices->begin(), sorted_indices->end(), std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend<int>); SortScorePairDescend<int>);
// Keep top_k scores if needed. // Keep top_k scores if needed.
if (top_k > -1 && top_k < sorted_indices->size()) { if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
sorted_indices->resize(top_k); sorted_indices->resize(top_k);
} }
} }
...@@ -151,7 +151,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -151,7 +151,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
while (sorted_indices.size() != 0) { while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second; const int idx = sorted_indices.front().second;
bool keep = true; bool keep = true;
for (int k = 0; k < selected_indices->size(); ++k) { for (size_t k = 0; k < selected_indices->size(); ++k) {
if (keep) { if (keep) {
const int kept_idx = (*selected_indices)[k]; const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size, T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
...@@ -201,7 +201,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -201,7 +201,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
int label = it.first; int label = it.first;
const T* sdata = scores_data + label * predict_dim; const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& label_indices = it.second; const std::vector<int>& label_indices = it.second;
for (int j = 0; j < label_indices.size(); ++j) { for (size_t j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j]; int idx = label_indices[j];
PADDLE_ENFORCE_LT(idx, predict_dim); PADDLE_ENFORCE_LT(idx, predict_dim);
score_index_pairs.push_back( score_index_pairs.push_back(
...@@ -215,7 +215,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -215,7 +215,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
// Store the new indices. // Store the new indices.
std::map<int, std::vector<int>> new_indices; std::map<int, std::vector<int>> new_indices;
for (int j = 0; j < score_index_pairs.size(); ++j) { for (size_t j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first; int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second; int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx); new_indices[label].push_back(idx);
...@@ -238,7 +238,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -238,7 +238,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
int label = it.first; int label = it.first;
const T* sdata = scores_data + label * predict_dim; const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& indices = it.second; const std::vector<int>& indices = it.second;
for (int j = 0; j < indices.size(); ++j) { for (size_t j = 0; j < indices.size(); ++j) {
int idx = indices[j]; int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize; const T* bdata = bboxes_data + idx * kBBoxSize;
odata[count * kOutputDim] = label; // label odata[count * kOutputDim] = label; // label
......
...@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel { ...@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios"); auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
bool flip = ctx->Attrs().Get<bool>("flip"); bool flip = ctx->Attrs().Get<bool>("flip");
PADDLE_ENFORCE_GT(min_sizes.size(), 0,
"Size of min_sizes must be at least 1.");
for (size_t i = 0; i < min_sizes.size(); ++i) {
PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
}
std::vector<float> aspect_ratios_vec; std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
...@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel { ...@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
} }
} }
PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
for (size_t i = 0; i < variances.size(); ++i) {
PADDLE_ENFORCE_GT(variances[i], 0.0,
"variance[%d] must be greater than 0.", i);
}
const float step_h = ctx->Attrs().Get<float>("step_h");
PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
const float step_w = ctx->Attrs().Get<float>("step_w");
PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
std::vector<int64_t> dim_vec(4); std::vector<int64_t> dim_vec(4);
dim_vec[0] = input_dims[2]; dim_vec[0] = input_dims[2];
dim_vec[1] = input_dims[3]; dim_vec[1] = input_dims[3];
...@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
"PriorBoxOp. The layout is [H, W, num_priors, 4]. " "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
"H is the height of input, W is the width of input, num_priors " "H is the height of input, W is the width of input, num_priors "
"is the box count of each position."); "is the box count of each position.");
AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
"List of min sizes of generated prior boxes."); AddAttr<std::vector<int>>("min_sizes",
AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ", "(vector<int>) List of min sizes "
"List of max sizes of generated prior boxes."); "of generated prior boxes.")
.AddCustomChecker([](const std::vector<int>& min_sizes) {
PADDLE_ENFORCE_GT(min_sizes.size(), 0,
"Size of min_sizes must be at least 1.");
for (size_t i = 0; i < min_sizes.size(); ++i) {
PADDLE_ENFORCE_GT(min_sizes[i], 0,
"min_sizes[%d] must be positive.", i);
}
});
AddAttr<std::vector<int>>(
"max_sizes",
"(vector<int>) List of max sizes of generated prior boxes.");
AddAttr<std::vector<float>>( AddAttr<std::vector<float>>(
"aspect_ratios", "(vector<float>) ", "aspect_ratios",
"List of aspect ratios of generated prior boxes."); "(vector<float>) List of aspect ratios of generated prior boxes.");
AddAttr<std::vector<float>>( AddAttr<std::vector<float>>(
"variances", "(vector<float>) ", "variances",
"List of variances to be encoded in prior boxes."); "(vector<float>) List of variances to be encoded in prior boxes.")
AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.") .AddCustomChecker([](const std::vector<float>& variances) {
PADDLE_ENFORCE_EQ(variances.size(), 4,
"Must and only provide 4 variance.");
for (size_t i = 0; i < variances.size(); ++i) {
PADDLE_ENFORCE_GT(variances[i], 0.0,
"variance[%d] must be greater than 0.", i);
}
});
AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.") AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
.SetDefault(true); .SetDefault(true);
AddAttr<float>("step_w", AddAttr<float>("step_w",
"Prior boxes step across width, 0 for auto calculation.") "Prior boxes step across width, 0 for auto calculation.")
.SetDefault(0.0); .SetDefault(0.0)
.AddCustomChecker([](const float& step_w) {
PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
});
AddAttr<float>("step_h", AddAttr<float>("step_h",
"Prior boxes step across height, 0 for auto calculation.") "Prior boxes step across height, 0 for auto calculation.")
.SetDefault(0.0); .SetDefault(0.0)
.AddCustomChecker([](const float& step_h) {
PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
});
AddAttr<float>("offset", AddAttr<float>("offset",
"(float) " "(float) "
"Prior boxes center offset.") "Prior boxes center offset.")
......
...@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, ...@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
std::vector<float>& output_aspect_ratior) { std::vector<float>& output_aspect_ratior) {
constexpr float epsilon = 1e-6; constexpr float epsilon = 1e-6;
output_aspect_ratior.clear(); output_aspect_ratior.clear();
output_aspect_ratior.push_back(1.); output_aspect_ratior.push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i]; float ar = input_aspect_ratior[i];
bool already_exist = false; bool already_exist = false;
...@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, ...@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
if (!already_exist) { if (!already_exist) {
output_aspect_ratior.push_back(ar); output_aspect_ratior.push_back(ar);
if (flip) { if (flip) {
output_aspect_ratior.push_back(1. / ar); output_aspect_ratior.push_back(1.0f / ar);
} }
} }
} }
...@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, ...@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
template <typename T> template <typename T>
struct ClipFunctor { struct ClipFunctor {
HOSTDEVICE T operator()(T in) const { HOSTDEVICE inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.); return std::min<T>(std::max<T>(in, 0.), 1.);
} }
}; };
...@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace()); boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace()); vars->mutable_data<T>(ctx.GetPlace());
T inv_img_width = 1.0 / img_width;
T inv_img_height = 1.0 / img_height;
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes); auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
for (int h = 0; h < feature_height; ++h) { for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) { for (int w = 0; w < feature_width; ++w) {
...@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
// first prior: aspect_ratio = 1, size = min_size // first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size; box_width = box_height = min_size;
// xmin // xmin
e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
...@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
// size = sqrt(min_size * max_size) // size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size * max_size); box_width = box_height = sqrt(min_size * max_size);
// xmin // xmin
e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; e_boxes(h, w, idx, 0) =
(center_x - box_width * 0.5) * inv_img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; e_boxes(h, w, idx, 2) =
(center_x + box_width * 0.5) * inv_img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
} }
...@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
box_width = min_size * sqrt(ar); box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar); box_height = min_size / sqrt(ar);
// xmin // xmin
e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; e_boxes(h, w, idx, 0) =
(center_x - box_width * 0.5) * inv_img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; e_boxes(h, w, idx, 2) =
(center_x + box_width * 0.5) * inv_img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
} }
} }
......
...@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}; };
break; break;
default: default:
sorted_domain = "event end time"; sorted_domain = "event first end time";
} }
std::vector<std::vector<EventItem>> events_table; std::vector<std::vector<EventItem>> events_table;
......
...@@ -79,6 +79,7 @@ function run_build() { ...@@ -79,6 +79,7 @@ function run_build() {
Building in /paddle/build ... Building in /paddle/build ...
============================================ ============================================
EOF EOF
make clean
make -j `nproc` make -j `nproc`
} }
...@@ -116,7 +117,7 @@ EOF ...@@ -116,7 +117,7 @@ EOF
-DWITH_STYLE_CHECK=OFF -DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py
make -j `nproc` paddle_python make -j `nproc` paddle_python
make -j `nproc` paddle_docs paddle_docs_cn make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
make -j `nproc` print_operators_doc make -j `nproc` print_operators_doc
paddle/pybind/print_operators_doc > doc/en/html/operators.json paddle/pybind/print_operators_doc > doc/en/html/operators.json
popd popd
......
...@@ -9,13 +9,14 @@ cd $TRAVIS_BUILD_DIR/build ...@@ -9,13 +9,14 @@ cd $TRAVIS_BUILD_DIR/build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py
make -j `nproc` paddle_python make -j `nproc` paddle_python
make -j `nproc` paddle_docs paddle_docs_cn make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
make -j `nproc` print_operators_doc make -j `nproc` print_operators_doc
paddle/pybind/print_operators_doc > doc/en/html/operators.json paddle/pybind/print_operators_doc > doc/en/html/operators.json
# check websites for broken links # check websites for broken links
linkchecker doc/en/html/index.html linkchecker doc/en/html/index.html
linkchecker doc/cn/html/index.html linkchecker doc/cn/html/index.html
linkchecker doc/api/en/html/index.html
# Parse Github URL # Parse Github URL
REPO=`git config remote.origin.url` REPO=`git config remote.origin.url`
...@@ -54,10 +55,11 @@ function deploy_docs() { ...@@ -54,10 +55,11 @@ function deploy_docs() {
mkdir -p ${DIR} mkdir -p ${DIR}
# remove old docs. mv new docs. # remove old docs. mv new docs.
set +e set +e
rm -rf ${DIR}/doc ${DIR}/doc_cn rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
set -e set -e
cp -r ../doc/cn/html ${DIR}/doc_cn cp -r ../doc/cn/html ${DIR}/doc_cn
cp -r ../doc/en/html ${DIR}/doc cp -r ../doc/en/html ${DIR}/doc
cp -r ../doc/api/en/html ${DIR}/api_doc
git add . git add .
} }
......
...@@ -12,10 +12,202 @@ ...@@ -12,10 +12,202 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
# Human-readable names for variable types, presumably indexed by the
# VarDesc VarType enum's integer value — confirm ordering against
# framework.proto before relying on it.
_vartype2str_ = [
    "UNK",
    "LoDTensor",
    "SelectedRows",
    "FeedMinibatch",
    "FetchList",
    "StepScopes",
    "LodRankTable",
    "LoDTensorArray",
    "PlaceList",
]
# Human-readable names for tensor element dtypes, indexed by the proto
# data_type enum's integer value (see repr_data_type/repr_tensor below).
_dtype2str_ = [
    "bool",
    "int16",
    "int32",
    "int64",
    "float16",
    "float32",
    "float64",
]


def repr_data_type(type):
    """Map a dtype enum integer to its human-readable name."""
    return _dtype2str_[type]


def repr_tensor(proto):
    """Render a TensorDesc proto as ``tensor(type=..., shape=...)``."""
    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
                                              str(proto.dims))


# Template shared by all variable reprs: "<type> <name> (<details>)".
reprtpl = "{ttype} {name} ({reprs})"


def repr_lodtensor(proto):
    """Render a VarDesc holding a LoDTensor; returns None for other types."""
    if not proto.lod_tensor: return
    level = proto.lod_tensor.lod_level
    reprs = repr_tensor(proto.lod_tensor.tensor)
    return reprtpl.format(
        # A lod_level of 0 is just a plain dense Tensor.
        ttype="LoDTensor" if level > 0 else "Tensor",
        name=proto.name,
        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)


def repr_selected_rows(proto):
    """Render a VarDesc holding SelectedRows; returns None for other types."""
    if not proto.selected_rows: return
    # NOTE(review): assumes proto.selected_rows is a bare TensorDesc
    # (unlike lod_tensor, which wraps one) — confirm against framework.proto.
    return reprtpl.format(
        ttype="SelectedRows",
        name=proto.name,
        reprs=repr_tensor(proto.selected_rows))


def repr_tensor_array(proto):
    """Render a VarDesc holding a TensorArray; returns None for other types."""
    if not proto.tensor_array: return
    return reprtpl.format(
        ttype="TensorArray",
        name=proto.name,
        # BUG FIX: the original passed proto.lod_tensor (a LoDTensorDesc,
        # which has no data_type/dims and belongs to a different var type)
        # to repr_tensor; use the array's own TensorDesc instead.
        reprs="level=%d, %s" % (proto.tensor_array.lod_level,
                                repr_tensor(proto.tensor_array.tensor)))


# Dispatch table: the first handler that returns a non-None repr wins.
type_handlers = [
    repr_lodtensor,
    repr_selected_rows,
    repr_tensor_array,
]
def repr_var(vardesc):
    """Return the repr from the first matching type handler, or None."""
    for render in type_handlers:
        text = render(vardesc)
        if text:
            return text
def pprint_program_codes(program_desc):
    """Pretty-print every block of a ProgramDesc, one section per block."""
    block_reprs = [
        pprint_block_codes(program_desc.block(i))
        for i in range(program_desc.num_blocks())
    ]
    return '\n'.join(block_reprs)
def pprint_block_codes(block_desc, show_backward=False):
    """Pretty-print one block as pseudo code.

    Args:
        block_desc: a framework_pb2.BlockDesc, or any wrapper exposing
            serialize_to_string() (it is round-tripped through the proto).
        show_backward(bool): when False, gradient operators and @GRAD
            variables are omitted from the output.

    Returns:
        str: a "// block-{idx} ..." header followed by one line per
        variable and per operator.
    """

    def is_op_backward(op_desc):
        # An op belongs to the backward pass if its type ends with '_grad'
        # or any input/output argument mentions a @GRAD variable.
        if op_desc.type.endswith('_grad'): return True

        # NOTE: shadows the outer is_var_backward below on purpose; this one
        # inspects an op's argument entry (parameter/arguments), not a
        # VarDesc. It returns None (falsy) when no @GRAD is found.
        def is_var_backward(var):
            if "@GRAD" in var.parameter: return True
            for arg in var.arguments:
                if "@GRAD" in arg: return True

        for var in op_desc.inputs:
            if is_var_backward(var): return True
        for var in op_desc.outputs:
            if is_var_backward(var): return True
        return False

    def is_var_backward(var_desc):
        # A variable is a gradient variable if @GRAD appears in its name.
        return "@GRAD" in var_desc.name

    # Normalize wrappers into a real BlockDesc proto.
    if type(block_desc) is not framework_pb2.BlockDesc:
        block_desc = framework_pb2.BlockDesc.FromString(
            block_desc.serialize_to_string())
    var_reprs = []
    op_reprs = []
    for var in block_desc.vars:
        if not show_backward and is_var_backward(var):
            continue
        var_reprs.append(repr_var(var))

    for op in block_desc.ops:
        if not show_backward and is_op_backward(op): continue
        op_reprs.append(repr_op(op))

    tpl = "// block-{idx} parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
    return tpl.format(
        idx=block_desc.idx,
        pidx=block_desc.parent_idx,
        vars='\n'.join(var_reprs),
        ops='\n'.join(op_reprs), )
def repr_attr(desc):
    """Render one op attribute as "key=value".

    Returns:
        A (repr_string, (key, value)) pair so callers can both print the
        attribute and collect it into a dict.
    """
    # One getter per attribute-type enum value; desc.type selects the field.
    getters = (
        lambda a: a.i,
        lambda a: a.f,
        lambda a: a.s,
        lambda a: a.ints,
        lambda a: a.floats,
        lambda a: a.strings,
        lambda a: a.b,
        lambda a: a.bools,
        lambda a: a.block_idx,
        lambda a: a.l,
    )
    key = desc.name
    value = getters[desc.type](desc)
    if key == "dtype":
        # Show the dtype name instead of its raw enum integer.
        value = repr_data_type(value)
    return "{key}={value}".format(key=key, value=str(value)), (key, value)
def _repr_op_fill_constant(optype, inputs, outputs, attrs):
if optype == "fill_constant":
return "{output} = {data} [shape={shape}]".format(
output=','.join(outputs),
data=attrs['value'],
shape=str(attrs['shape']))
op_repr_handlers = [_repr_op_fill_constant, ]
def repr_op(opdesc):
optype = None
attrs = []
attr_dict = {}
is_target = None
inputs = []
outputs = []
tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
for var in opdesc.inputs:
key = var.parameter
value = args2value(var.arguments)
inputs.append("%s=%s" % (key, value))
for var in opdesc.outputs:
value = args2value(var.arguments)
outputs.append(value)
for attr in opdesc.attrs:
attr_repr, attr_pair = repr_attr(attr)
attrs.append(attr_repr)
attr_dict[attr_pair[0]] = attr_pair[1]
is_target = opdesc.is_target
for handler in op_repr_handlers:
res = handler(opdesc.type, inputs, outputs, attr_dict)
if res: return res
return tpl.format(
outputs=', '.join(outputs),
optype=opdesc.type,
inputs=', '.join(inputs),
attrs="{%s}" % ','.join(attrs),
is_target=", is_target" if is_target else "")
def draw_block_graphviz(block, highlights=None, path="./temp.dot"): def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
''' '''
......
...@@ -17,7 +17,9 @@ import contextlib ...@@ -17,7 +17,9 @@ import contextlib
from framework import Program, default_main_program from framework import Program, default_main_program
from . import core from . import core
__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope'] __all__ = [
'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
]
g_scope = core.Scope() g_scope = core.Scope()
...@@ -80,12 +82,12 @@ def has_feed_operators(block, feed_targets, feed_holder_name): ...@@ -80,12 +82,12 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
Args: Args:
block: a block instance (typically global block of a program) block: a block instance (typically global block of a program)
feed_targets: a dictionary of {feed_target_name: feed_target_data} feed_targets: a dictionary of {feed_target_name: feed_target_data}
feed_holder_name: the name of the variable that holds the data of feed_holder_name: the name of the variable that holds the data of
all feed targets. The type of this feed_holder variable is all feed targets. The type of this feed_holder variable is
FEED_MINIBATCH, which is essentially vector<LoDTensor>. FEED_MINIBATCH, which is essentially vector<LoDTensor>.
Returns: Returns:
A boolean value that indicates whether a block has feed operators A boolean value that indicates whether a block has feed operators
that match the info contained in feed_targets and feed_holder_name. that match the info contained in feed_targets and feed_holder_name.
""" """
...@@ -108,7 +110,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name): ...@@ -108,7 +110,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
def has_fetch_operators(block, fetch_targets, fetch_holder_name): def has_fetch_operators(block, fetch_targets, fetch_holder_name):
""" Check whether the block already has fetch operators. """ Check whether the block already has fetch operators.
Return false if the block does not have any fetch operators. Return false if the block does not have any fetch operators.
If some fetch operators have been appended to the block, check that If some fetch operators have been appended to the block, check that
the info contained in these fetch operators matches the fetch_targets the info contained in these fetch operators matches the fetch_targets
...@@ -118,13 +120,13 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -118,13 +120,13 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
Args: Args:
block: a block instance (typically global block of a program) block: a block instance (typically global block of a program)
fetch_targets: a dictionary of {fetch_target_name: fetch_target_data} fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
fetch_holder_name: the name of the variable that holds the data of fetch_holder_name: the name of the variable that holds the data of
all fetch targets. The type of this fetch_holder variable is all fetch targets. The type of this fetch_holder variable is
FETCH_LIST, which is essentially vector<LoDTensor>. FETCH_LIST, which is essentially vector<LoDTensor>.
Return: Return:
A boolean value that indicates whether a block has fetch operators A boolean value that indicates whether a block has fetch operators
that match the info contained in fetch_targets and fetch_holder_name. that match the info contained in fetch_targets and fetch_holder_name.
""" """
fetch_count = 0 fetch_count = 0
...@@ -146,6 +148,35 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -146,6 +148,35 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return fetch_count > 0 return fetch_count > 0
def fetch_var(name, scope=None, return_numpy=True):
    """
    Fetch the value of the variable with the given name from the given scope

    Args:
        name(str): name of the variable. Typically, only persistable variables
            can be found in the scope used for running the program.
        scope(core.Scope|None): scope object. It should be the scope where
            you pass to Executor.run() when running your program.
            If None, global_scope() will be used.
        return_numpy(bool): whether convert the tensor to numpy.ndarray

    Returns:
        LodTensor|numpy.ndarray
    """
    assert isinstance(name, str)
    if scope is None:
        scope = global_scope()
    assert isinstance(scope, core.Scope)

    # BUG FIX: look the variable up in the caller-supplied scope; the
    # original always queried global_scope(), silently ignoring `scope`.
    var = scope.find_var(name)
    assert var is not None, (
        "Cannot find " + name + " in scope. Perhaps you need to make the"
        " variable persistable by using var.persistable = True in your"
        " program.")
    tensor = var.get_tensor()
    if return_numpy:
        tensor = as_numpy(tensor)
    return tensor
class Executor(object): class Executor(object):
def __init__(self, places): def __init__(self, places):
if not isinstance(places, list) and not isinstance(places, tuple): if not isinstance(places, list) and not isinstance(places, tuple):
......
...@@ -31,6 +31,7 @@ __all__ = [ ...@@ -31,6 +31,7 @@ __all__ = [
'program_guard', 'program_guard',
'switch_startup_program', 'switch_startup_program',
'switch_main_program', 'switch_main_program',
'get_var',
] ]
EMPTY_VAR_NAME = core.kEmptyVarName() EMPTY_VAR_NAME = core.kEmptyVarName()
...@@ -1123,3 +1124,22 @@ def program_guard(main_program, startup_program=None): ...@@ -1123,3 +1124,22 @@ def program_guard(main_program, startup_program=None):
switch_main_program(main_program) switch_main_program(main_program)
if startup_program is not None: if startup_program is not None:
switch_startup_program(startup_program) switch_startup_program(startup_program)
def get_var(name, program=None):
    """
    Get a variable by name from the global block of a program

    Args:
        name(str): name of the variable
        program(Program|None): program object.
            If None, default_global_program() will be used.

    Returns:
        Variable
    """
    if program is None:
        program = default_main_program()
    assert isinstance(name, str)
    # BUG FIX: validate the program argument; the original asserted
    # isinstance(name, Program), which can never hold after the str check.
    assert isinstance(program, Program)

    return program.global_block().var(name)
...@@ -1231,10 +1231,17 @@ def conv2d(input, ...@@ -1231,10 +1231,17 @@ def conv2d(input,
""" """
if stride is None: if stride is None:
stride = [1, 1] stride = [1, 1]
helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype()
num_channels = input.shape[1] num_channels = input.shape[1]
l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
if groups is None: if groups is None:
num_filter_channels = num_channels num_filter_channels = num_channels
else: else:
...@@ -1267,7 +1274,7 @@ def conv2d(input, ...@@ -1267,7 +1274,7 @@ def conv2d(input,
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type='conv2d', type=l_type,
inputs={ inputs={
'Input': input, 'Input': input,
'Filter': filter_param, 'Filter': filter_param,
......
...@@ -35,13 +35,15 @@ __all__ = [ ...@@ -35,13 +35,15 @@ __all__ = [
] ]
def create_tensor(dtype, name=None): def create_tensor(dtype, name=None, persistable=False):
helper = LayerHelper("create_tensor", **locals()) helper = LayerHelper("create_tensor", **locals())
return helper.create_variable(name=helper.name, dtype=dtype) return helper.create_variable(
name=helper.name, dtype=dtype, persistable=persistable)
def create_parameter(shape, def create_parameter(shape,
dtype, dtype,
name=None,
attr=None, attr=None,
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None):
...@@ -62,7 +64,7 @@ def create_parameter(shape, ...@@ -62,7 +64,7 @@ def create_parameter(shape,
""" """
helper = LayerHelper("create_parameter", **locals()) helper = LayerHelper("create_parameter", **locals())
if attr is None: if attr is None:
attr = ParamAttr() attr = ParamAttr(name=name)
return helper.create_parameter(attr, shape, dtype, is_bias, return helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer) default_initializer)
......
...@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None): ...@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
core.enable_profiler(prof_state) core.enable_profiler(prof_state)
yield yield
if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The state must be in 'calls', 'total', "
"'max', 'min', 'ave'")
sorted_key = 'default' if sorted_key is None else sorted_key sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = { key_map = {
'default': core.EventSortingKey.kDefault, 'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls, 'calls': core.EventSortingKey.kCalls,
......
...@@ -166,7 +166,9 @@ def infer(use_cuda, save_dirname=None): ...@@ -166,7 +166,9 @@ def infer(use_cuda, save_dirname=None):
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input's dimension of conv should be 4-D or 5-D. # The input's dimension of conv should be 4-D or 5-D.
tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32") # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
tensor_img = numpy.random.uniform(-1.0, 1.0,
[1, 1, 28, 28]).astype("float32")
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
......
...@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1): ...@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1):
self.op_type = "conv2d" self.op_type = "conv2d"
class TestDepthwiseConv(TestConv2dOp):
    """Depthwise conv case: groups equals the input channel count, stride 2."""

    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
        assert np.mod(self.input_size[1], self.groups) == 0
        # Use floor division so the filter channel count stays an int under
        # Python 3 ('/' would yield a float); exact here since divisibility
        # is asserted above, and identical to '/' under Python 2.
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]
        self.op_type = "depthwise_conv2d"
class TestDepthwiseConv2(TestConv2dOp):
    """Depthwise conv case: groups equals the input channel count, stride 1."""

    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.groups = 3
        assert np.mod(self.input_size[1], self.groups) == 0
        # Use floor division so the filter channel count stays an int under
        # Python 3 ('/' would yield a float); exact here since divisibility
        # is asserted above, and identical to '/' under Python 2.
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]
        self.op_type = "depthwise_conv2d"
# cudnn v5 does not support dilation conv. # cudnn v5 does not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # class TestCUDNNWithDilation(TestWithDilation):
# def init_op_type(self): # def init_op_type(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
import op_test
import numpy
import unittest
class TestFetchVar(op_test.OpTest):
    """Checks that fluid.fetch_var retrieves a persistable tensor by name."""

    def test_fetch_var(self):
        expected = numpy.array([1, 3, 5]).astype(numpy.int32)
        # The variable must be persistable so it is still alive in the
        # global scope after the program has finished running.
        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
        layers.assign(input=expected, output=x)
        executor = fluid.Executor(fluid.CPUPlace())
        executor.run(fluid.default_main_program(), feed={}, fetch_list=[])
        fetched_x = fluid.fetch_var("x")
        self.assertTrue(
            numpy.array_equal(fetched_x, expected),
            "fetch_x=%s val=%s" % (fetched_x, expected))
        self.assertEqual(fetched_x.dtype, expected.dtype)


if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册