提交 b0904794 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

...@@ -35,8 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS ...@@ -35,8 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add( ExternalProject_Add(
extern_anakin extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" DEPENDS ${MKLML_PROJECT}
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6" # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code.
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
PREFIX ${ANAKIN_SOURCE_DIR} PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES CMAKE_ARGS -DUSE_GPU_PLACE=YES
......
...@@ -115,6 +115,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) ...@@ -115,6 +115,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc) # cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc ) cc_test(tuple_test SRCS tuple_test.cc )
cc_test(rw_lock_test SRCS rw_lock_test.cc)
# disable test temporarily. # disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971
# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <array>
#include <string> #include <string>
#include <vector> #include <vector>
......
...@@ -12,7 +12,11 @@ ...@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include <stack> #include <stack>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
......
...@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const { ...@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
} }
void OpDesc::SetAttr(const std::string &name, const Attribute &v) { void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
// NOTICE(minqiyang): pybind11 will take the empty list in python as
// the std::vector<int> type in C++; so we have to change the attr's type
// here if we meet this issue
proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
if (attr_type == proto::AttrType::INTS &&
boost::get<std::vector<int>>(v).size() == 0u) {
// Find current attr via attr name and set the correct attribute value
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
default:
PADDLE_THROW("Wrong attr type %d", attr.type());
}
need_update_ = true;
return;
}
this->attrs_[name] = v; this->attrs_[name] = v;
need_update_ = true; need_update_ = true;
} }
...@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const { ...@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second; return it->second;
} }
// Returns the proto-level declaration of the attribute called `name`.
//
// The OpProto registered for this op's Type() is fetched from OpInfoMap and
// its attrs are scanned in order for an exact name match. The first match is
// returned by reference; when none matches, PADDLE_THROW is invoked with the
// attribute name and the proto's type.
const proto::OpProto::Attr &OpDesc::GetProtoAttr(
    const std::string &name) const {
  const proto::OpProto &op_proto = OpInfoMap::Instance().Get(Type()).Proto();
  for (const proto::OpProto::Attr &candidate : op_proto.attrs()) {
    if (candidate.name() == name) {
      return candidate;
    }
  }
  PADDLE_THROW("Attribute %s is not found in proto %s", name, op_proto.type());
}
Attribute OpDesc::GetNullableAttr(const std::string &name) const { Attribute OpDesc::GetNullableAttr(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
if (it != attrs_.end()) { if (it != attrs_.end()) {
......
...@@ -81,6 +81,8 @@ class OpDesc { ...@@ -81,6 +81,8 @@ class OpDesc {
Attribute GetAttr(const std::string &name) const; Attribute GetAttr(const std::string &name) const;
const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const; int GetBlockAttrId(const std::string &name) const;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <pthread.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
// Thin wrapper around a POSIX read-write lock (pthread_rwlock_t).
//
// The lock is initialized with default attributes on construction and
// destroyed on destruction. Every RDLock()/WRLock() must be paired with one
// UNLock() by the caller; nothing here releases a held lock automatically.
struct RWLock {
  // Initializes the lock. A non-zero return code is reported through
  // PADDLE_ENFORCE_EQ rather than leaving lock_ in an unusable state.
  RWLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_init(&lock_, nullptr), 0,
                      "init rwlock failed");
  }

  // The return code is deliberately not enforced: a destructor must not throw.
  ~RWLock() { pthread_rwlock_destroy(&lock_); }

  // Copying would duplicate the raw pthread_rwlock_t and destroy it twice, so
  // the type is non-copyable (which also suppresses the implicit moves).
  RWLock(const RWLock &) = delete;
  RWLock &operator=(const RWLock &) = delete;

  // Acquires the lock for reading (pthread_rwlock_rdlock).
  void RDLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
                      "acquire read lock failed");
  }

  // Acquires the lock for writing (pthread_rwlock_wrlock).
  void WRLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
                      "acquire write lock failed");
  }

  // Releases a lock previously taken with RDLock() or WRLock().
  void UNLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
  }

 private:
  pthread_rwlock_t lock_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h"
#include <gtest/gtest.h>
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include <vector>
namespace f = paddle::framework;
// Thread body: takes the read lock on `lock` and releases it right away.
void f1(f::RWLock *lock) {
  f::RWLock &rw = *lock;
  rw.RDLock();
  rw.UNLock();
}
// Two reader threads take and release the read lock while this thread is
// still holding it for reading; the test completes once both have finished.
TEST(RWLOCK, read_read) {
  f::RWLock rw_lock;
  rw_lock.RDLock();

  std::vector<std::thread> readers;
  readers.emplace_back(f1, &rw_lock);
  readers.emplace_back(f1, &rw_lock);
  for (std::thread &reader : readers) {
    reader.join();
  }

  rw_lock.UNLock();
}
// Thread body: under the read lock, checks that `result` is still empty.
//
// EXPECT_EQ is used instead of ASSERT_EQ on purpose: a failing ASSERT_EQ
// returns from this function immediately, which would skip UNLock() and leave
// the read lock held, blocking any later writer.
void f2(f::RWLock *lock, std::vector<int> *result) {
  lock->RDLock();
  EXPECT_EQ(result->size(), 0UL);
  lock->UNLock();
}
// Thread body: appends the value 1 to `result` while holding the write lock.
void f3(f::RWLock *lock, std::vector<int> *result) {
  f::RWLock &rw = *lock;
  std::vector<int> &values = *result;
  rw.WRLock();
  values.push_back(1);
  rw.UNLock();
}
// While this thread holds the read lock, another reader (f2) may enter, but
// the writer (f3) must not have modified `result` until the read lock is
// released.
TEST(RWLOCK, read_write) {
  f::RWLock lock;
  std::vector<int> result;

  lock.RDLock();
  // A second reader runs to completion while the read lock is still held.
  std::thread t1(f2, &lock, &result);
  t1.join();

  // Start the writer and give it a second to (incorrectly) get through.
  std::thread t2(f3, &lock, &result);
  std::this_thread::sleep_for(std::chrono::seconds(1));
  // EXPECT_EQ rather than ASSERT_EQ: a fatal failure would return from the
  // test with the read lock still held and t2 still joinable, and destroying
  // a joinable std::thread calls std::terminate.
  EXPECT_EQ(result.size(), 0UL);

  lock.UNLock();
  t2.join();
  // After the read lock is released the writer has pushed its element.
  ASSERT_EQ(result.size(), 1UL);
}
// Thread body: under the read lock, checks that `result` holds exactly one
// element.
//
// EXPECT_EQ is used instead of ASSERT_EQ on purpose: a failing ASSERT_EQ
// returns from this function immediately, which would skip UNLock() and leave
// the read lock held when the RWLock is later destroyed.
void f4(f::RWLock *lock, std::vector<int> *result) {
  lock->RDLock();
  EXPECT_EQ(result->size(), 1UL);
  lock->UNLock();
}
// This thread holds the write lock while a reader (f4) is started. The
// element is pushed one second later, still under the write lock, and f4
// checks that it sees exactly one element once it gets the read lock.
TEST(RWLOCK, write_read) {
  f::RWLock rw_lock;
  std::vector<int> values;

  rw_lock.WRLock();
  std::thread reader(f4, &rw_lock, &values);

  const std::chrono::seconds pause(1);
  std::this_thread::sleep_for(pause);

  values.push_back(1);
  rw_lock.UNLock();
  reader.join();
}
...@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const { ...@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
: true; : true;
} }
std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get( int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
const std::vector<int64_t>& keys, framework::Tensor* value) const { rwlock_->RDLock();
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %d not found", key);
}
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %d should have the same size with rows_ %d",
map_size, vector_size);
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
size_t row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
}
// key logic to put a key into id_to_index_
rows_.push_back(key);
auto index = static_cast<int64_t>(rows_.size() - 1);
id_to_index_[key] = index;
rwlock_->UNLock();
return index;
} else {
auto index = write_iter->second;
rwlock_->UNLock();
return index;
}
} else {
auto index = iter->second;
rwlock_->UNLock();
return index;
}
}
// Rebuilds id_to_index_ from rows_ while holding the write lock: the map is
// cleared and every rows_[i] is mapped to its position i. When a key occurs
// more than once in rows_, the last position overwrites the earlier ones.
void SelectedRows::SyncIndex() {
  rwlock_->WRLock();
  id_to_index_.clear();
  size_t pos = 0;
  while (pos < rows_.size()) {
    id_to_index_[rows_[pos]] = pos;
    ++pos;
  }
  rwlock_->UNLock();
}
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown) {
PADDLE_ENFORCE(value->IsInitialized(), PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized."); "The value tensor should be initialized.");
std::vector<std::pair<int64_t, int64_t>> non_keys_pair; if (ids.numel() == 0) {
if (keys.empty()) {
VLOG(3) << "keys is empty, please check data!"; VLOG(3) << "keys is empty, please check data!";
} else { } else {
int64_t value_width = value_->numel() / value_->dims()[0]; int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table " "output tensor should have the same shape with table "
"except the dims[0]."); "except the dims[0].");
for (size_t i = 0; i < ids.numel(); ++i) {
for (size_t i = 0; i < keys.size(); ++i) { int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
int64_t index = Index(keys[i]);
if (index == -1) {
non_keys_pair.push_back(
std::make_pair(keys[i], static_cast<int64_t>(i)));
} else {
framework::VisitDataType( framework::VisitDataType(
framework::ToDataType(value_->type()), framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(), TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width)); index * value_width, value_width));
} }
} }
}
return non_keys_pair;
}
bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
if (value_->IsInitialized()) {
PADDLE_ENFORCE_EQ(
value.type(), value_->type(),
"The type of the value should be same with the original value");
}
PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
"The first dim of value should be 1.");
std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
auto index = Index(key);
bool is_new_key = false;
if (index == -1) {
rows_.push_back(key);
index = rows_.size() - 1;
is_new_key = true;
// whether need to resize the table
if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
auto dims = value_->dims();
dims[0] = (dims[0] + 1) << 1;
framework::VisitDataType(framework::ToDataType(value.type()),
ReAllocateVisitor(dims, value_.get()));
}
}
framework::VisitDataType(
framework::ToDataType(value.type()),
TensorCopyVisitor(value_.get(),
index * value_->numel() / value_->dims()[0], value,
static_cast<int64_t>(0), value.numel()));
return is_new_key;
} }
} // namespace framework } // namespace framework
......
...@@ -17,10 +17,12 @@ limitations under the License. */ ...@@ -17,10 +17,12 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
...@@ -48,13 +50,13 @@ class SelectedRows { ...@@ -48,13 +50,13 @@ class SelectedRows {
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height) SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) { : rows_(rows), height_(height) {
value_.reset(new Tensor()); value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex); rwlock_.reset(new RWLock);
} }
SelectedRows() { SelectedRows() {
height_ = 0; height_ = 0;
value_.reset(new Tensor()); value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex); rwlock_.reset(new RWLock);
} }
platform::Place place() const { return value_->place(); } platform::Place place() const { return value_->place(); }
...@@ -74,47 +76,51 @@ class SelectedRows { ...@@ -74,47 +76,51 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; } void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/* /*
* @brief wheter has the specified key in the table. * @brief Get the index of key in rows
*
* @return -1 if the key does not exists.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %s not in table", key);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
/*
* @brief whether has the specified key in the table.
* *
* @return true if the key is exists. * @return true if the key is exists.
*/ */
bool HasKey(int64_t key) const; bool HasKey(int64_t key) const;
/* /*
* @brief Get value by the key list, if the * @brief Get value by the key list.
* Note!!! this interface is only used when selected_rows is used as
* parameters
* for distribute lookup table.
* *
* @return a list of pair which contains the non-exists key and the index in * @return a list of pair which contains the non-exists key and the index in
* the value * the value
*/ */
std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys, void Get(const framework::Tensor& ids, framework::Tensor* value,
framework::Tensor* value) const; bool auto_grown = false);
/* /*
* @brief Set a key-value pair into the table. * @brief Get the index of the key from id_to_index_ map. If the key not
* This function will double the value memory if it's not engouth. * exist,
* add the key into id_to_index_.
* *
* @note: * Note!!! this interface is only used when selected_rows is used as
* 1. The first dim of the value should be 1 * parameters
* 2. The value should be initialized and the data type * for distribute lookup table.
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
* *
* @return index of the key.
*/ */
bool Set(int64_t key, const Tensor& value); int64_t AutoGrownIndex(int64_t key, bool auto_grown);
/* void SyncIndex();
* @brief Get the index of key in rows
*
* @return -1 if the key does not exists.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const { DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims()); std::vector<int64_t> dims = vectorize(value_->dims());
...@@ -127,9 +133,10 @@ class SelectedRows { ...@@ -127,9 +133,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a // SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled. // SelectedRows add a Tensor, will the duplicate rows be handled.
Vector<int64_t> rows_; Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unique_ptr<Tensor> value_{nullptr}; std::unique_ptr<Tensor> value_{nullptr};
int64_t height_; int64_t height_;
std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr}; std::unique_ptr<RWLock> rwlock_{nullptr};
}; };
/* /*
......
...@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include <time.h>
#include <thread> // NOLINT
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ...@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
} }
TEST_F(SelectedRowsTester, SparseTable) { TEST(SelectedRows, SparseTable) {
platform::CPUPlace cpu; platform::CPUPlace cpu;
SelectedRows table; SelectedRows table;
int64_t table_size = 100;
int64_t embedding_width = 8;
// initialize a sparse table // initialize a sparse table
table.mutable_value()->Resize(framework::make_ddim({1, 100})); table.mutable_value()->Resize(
table.mutable_value()->mutable_data<float>(cpu); framework::make_ddim({table_size, embedding_width}));
table.mutable_rows()->push_back(1); auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6));
ASSERT_EQ(table.rows().size(), 3);
framework::Tensor ids;
ids.Resize(framework::make_ddim({4}));
auto* ids_data = ids.mutable_data<int64_t>(cpu);
ids_data[0] = static_cast<int64_t>(6);
ids_data[1] = static_cast<int64_t>(6);
ids_data[2] = static_cast<int64_t>(8);
ids_data[3] = static_cast<int64_t>(10);
int64_t key = 10000; framework::Tensor get_value;
int64_t non_key = 999; auto* value_data = get_value.mutable_data<float>(
framework::Tensor value; framework::make_ddim({4, embedding_width}), cpu);
value.Resize(framework::make_ddim({1, 100})); table.Get(ids, &get_value);
auto ptr = value.mutable_data<float>(cpu);
ptr[0] = static_cast<float>(10);
ASSERT_EQ(table.rows().size(), static_cast<size_t>(1)); for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(table.HasKey(key), false); ASSERT_EQ(value_data[0 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[1 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[2 * embedding_width + j], 1);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[3 * embedding_width + j], 0);
}
}
table.Set(key, value); void f1(SelectedRows* table, int table_size) {
for (int i = 1000000; i > 0; --i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
ASSERT_EQ(table.rows().size(), static_cast<size_t>(2)); void f2(SelectedRows* table, int table_size) {
ASSERT_EQ(table.HasKey(key), true); for (int i = 0; i < 1000000; ++i) {
// check re-allocate auto id = i % table_size;
ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4)); int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
framework::Tensor get_value; void f3(SelectedRows* table, int table_size) {
get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu); clock_t t1 = clock();
std::vector<int64_t> keys({non_key, key}); for (int i = 100000; i > 0; --i) {
auto non_key_pairs = table.Get(keys, &get_value); auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f3 run time:" << t2 - t1 << std::endl;
}
void f4(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 0; i < 100000; ++i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f4 run time:" << t2 - t1 << std::endl;
}
TEST(SelectedRows, MultiThreadAutoIndex) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100000;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10)); std::thread t1(f1, &table, table_size);
ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1)); std::thread t11(f1, &table, table_size);
ASSERT_EQ(non_key_pairs[0].first, non_key); std::thread t2(f2, &table, table_size);
std::thread t22(f2, &table, table_size);
t1.join();
t11.join();
t2.join();
t22.join();
std::thread t3(f3, &table, table_size);
std::thread t4(f4, &table, table_size);
t3.join();
t4.join();
} }
} // namespace framework } // namespace framework
......
...@@ -8,7 +8,7 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph ...@@ -8,7 +8,7 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
helper.cc helper.cc
model_store_pass.cc model_store_pass.cc
DEPS framework_proto proto_desc) DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
......
...@@ -20,17 +20,6 @@ namespace paddle { ...@@ -20,17 +20,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_index_ = std::type_index(typeid(std::string));
}
PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const { std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"), return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"), Dot::Attr("shape", "box"),
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/device.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -38,39 +39,35 @@ class NodeMap; ...@@ -38,39 +39,35 @@ class NodeMap;
// A helper class to maintain the status from Pass. // A helper class to maintain the status from Pass.
struct NodeAttr { struct NodeAttr {
using any_t =
boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
// NOTE T should be a primary type or a struct combined by several primary // NOTE T should be a primary type or a struct combined by several primary
// types. // types.
// NOTE the STL containers should not use here. // NOTE the STL containers should not use here.
// Some usages // Some usages
// Attr attr; // Attr attr;
// attr.Bool() = true; // attr.Bool() = true;
bool &Bool() { return As<bool>(); } bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); } float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); } int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); } int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); } void *&Pointer() { return As<void *>(); }
std::string &String(); std::string &String() { return As<std::string>(); }
private: private:
template <typename T> template <typename T>
T &As() { T &As() {
// init storage in the first usage. if (type_index_ == typeid(NodeAttr)) {
if (data_.empty()) { type_index_ = typeid(T);
VLOG(4) << "resize data to " << sizeof(T); any_data_ = T();
type_index_ = std::type_index(typeid(T)); } else {
data_.resize(sizeof(T)); PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
} }
PADDLE_ENFORCE(framework::IsType<T>(type_index_), return boost::get<T>(any_data_);
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_index_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
} }
private: private:
std::string data_; any_t any_data_;
std::type_index type_index_{typeid(NodeAttr)}; std::type_index type_index_{typeid(NodeAttr)};
}; };
......
...@@ -20,6 +20,24 @@ namespace paddle { ...@@ -20,6 +20,24 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
// A bool written through Bool() is read back unchanged.
TEST(NodeAttr, bool) {
  const bool expected = true;
  NodeAttr attr;
  attr.Bool() = expected;
  ASSERT_EQ(attr.Bool(), expected);
}
// An integer written through Int32() is read back unchanged.
TEST(NodeAttr, int32) {
  const int expected = 32;
  NodeAttr attr;
  attr.Int32() = expected;
  ASSERT_EQ(attr.Int32(), expected);
}
// A string written through String() is read back unchanged.
TEST(NodeAttr, string) {
  const char *const expected = "Hello";
  NodeAttr attr;
  attr.String() = expected;
  ASSERT_EQ(attr.String(), expected);
}
TEST(Node, Attr) { TEST(Node, Attr) {
// Node is an abstract class, use Value instead for they share the same Attr // Node is an abstract class, use Value instead for they share the same Attr
// logic. // logic.
...@@ -27,6 +45,9 @@ TEST(Node, Attr) { ...@@ -27,6 +45,9 @@ TEST(Node, Attr) {
auto* node = nodes.Create(Node::Type::kValue); auto* node = nodes.Create(Node::Type::kValue);
node->attr("v0").Int32() = 2008; node->attr("v0").Int32() = 2008;
ASSERT_EQ(node->attr("v0").Int32(), 2008); ASSERT_EQ(node->attr("v0").Int32(), 2008);
node->attr("str").String() = "hello world";
ASSERT_EQ(node->attr("str").String(), "hello world");
} }
} // namespace analysis } // namespace analysis
......
...@@ -13,16 +13,22 @@ else ...@@ -13,16 +13,22 @@ else
use_gpu_list='false' use_gpu_list='false'
fi fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
# download vis_demo data # download vis_demo data
function download() { function download() {
dir_name=$1 dir_name=$1
mkdir -p $dir_name mkdir -p $dir_name
cd $dir_name cd $dir_name
if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
# Report the archive that was found; ${dir_name} needs its '$' to expand.
echo "${PREFIX}${dir_name}.tar.gz has been downloaded."
else
wget -q ${URL_ROOT}$dir_name.tar.gz wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz tar xzf *.tar.gz
fi
cd .. cd ..
} }
URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
mkdir -p data mkdir -p data
cd data cd data
vis_demo_list='se_resnext50 ocr mobilenet' vis_demo_list='se_resnext50 ocr mobilenet'
......
...@@ -26,8 +26,6 @@ namespace plat = paddle::platform; ...@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \ act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \ ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \ ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \ ops::grad_functor<double>>);
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
...@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> { ...@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = static_cast<T>(0.5) * dout / out; const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
} }
}; };
...@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> { ...@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) * dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor) - static_cast<T>(1)); x.pow(static_cast<T>(factor - static_cast<T>(1)));
} }
}; };
...@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
T b = static_cast<T>(beta);
auto temp1 = static_cast<T>(1) / auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp()); (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (b * out)); auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((b * out) + temp2); dx.device(d) = dout * ((beta * out) + temp2);
} }
}; };
......
...@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>, REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>, ops::AssignValueKernel<float>);
ops::AssignValueKernel<plat::float16>);
...@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType; ...@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024; static_cast<size_t>(1024) * 1024 * 1024;
// Chooses the cuDNN math type for `cudnn_conv_desc` and reports whether the
// tensor-op (float16) path was selected.
//
// When built with CUDA >= 9.0 and cuDNN >= 7.0.1:
//   * if the device reports compute capability >= 70 and T is
//     platform::float16, the descriptor is switched to CUDNN_TENSOR_OP_MATH
//     and true is returned;
//   * otherwise the descriptor is set to CUDNN_DEFAULT_MATH and false is
//     returned.
// With older toolchains the descriptor is left untouched and false is
// returned.
template <typename T, typename DeviceContext>
bool EnableFp16(const DeviceContext& dev_ctx,
                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
  // Tensor-op math is requested only for float16 data on devices whose
  // reported compute capability is at least 70.
  const bool use_tensor_op =
      dev_ctx.GetComputeCapability() >= 70 &&
      std::type_index(typeid(T)) == std::type_index(typeid(platform::float16));
  if (!use_tensor_op) {
    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
    return false;
  }
  PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
      cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
  return true;
#else
  return false;
#endif
}
template <typename T> template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> { class CUDNNConvOpKernel : public framework::OpKernel<T> {
public: public:
...@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t algo; cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
} else {
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
} }
#endif
// get workspace size able to allocate // get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
...@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else { } else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} }
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
...@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else { } else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} }
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE( CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
...@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, ...@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>); paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>, paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>, paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<float>,
...@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, ...@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>); paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>, paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>, paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
...@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
using CUDACtx = paddle::platform::CUDADeviceContext; using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cross_entropy, REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>, ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>, ops::CrossEntropyOpKernel<CUDACtx, double>);
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>); REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
REGISTER_OP_CUDA_KERNEL( ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>, ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
...@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, ...@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) { int64_t rows_numel) {
CreateVarsOnScope(scope, place); CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>(); auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value(); auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10}); w_value->Resize({rows_numel, 10});
for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
auto ptr = w_value->mutable_data<float>(*place); auto ptr = w_value->mutable_data<float>(*place);
......
...@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>, ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -14,24 +14,19 @@ limitations under the License. */ ...@@ -14,24 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad, elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>); int64_t>);
...@@ -14,25 +14,19 @@ limitations under the License. */ ...@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul, elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>); int64_t>);
...@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( ...@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int j = blockIdx.x; int j = blockIdx.x;
int i = threadIdx.x; int i = threadIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
T val(0); T val = 0;
do { do {
int x_offset = i * w + j; int x_offset = i * w + j;
...@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( ...@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x; int tid = threadIdx.x;
int j = blockIdx.x; int j = blockIdx.x;
T val(0); T val = 0;
int ttid = tid; int ttid = tid;
while (true) { while (true) {
......
...@@ -14,25 +14,19 @@ limitations under the License. */ ...@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
int64_t>); int64_t>);
...@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class FillConstantOp : public framework::OperatorWithKernel { class FillConstantInferShape : public framework::InferShapeBase {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; void operator()(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null."); "Output(Out) of FillConstantOp should not be null.");
auto& shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape)); ctx->SetOutputDim("Out", framework::make_ddim(shape));
} }
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out.mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
}
framework::OpKernelType GetExpectedKernelType( platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
const framework::ExecutionContext& ctx) const override { auto &dev_ctx = *pool.Get(dev_place);
return framework::OpKernelType( math::set_constant(dev_ctx, &out, value);
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
} }
}; };
...@@ -67,11 +87,6 @@ Fill up a variable with specified constant value. ...@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
// Registers the CPU kernels of the fill_constant operator, one
// FillConstantOpKernel instantiation per supported element type
// (float, double, int, int64_t).
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
// Short alias for the operator namespace used in the registration below.
namespace ops = paddle::operators;
// Registers the CUDA kernels of the fill_constant operator, one
// FillConstantOpKernel instantiation per supported element type
// (float, double, int, int64_t and platform::float16).
REGISTER_OP_CUDA_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Kernel that fills the output tensor "Out" with a single constant.
//
// Attributes read from the execution context:
//   "dtype"     - integer cast to framework::proto::VarType::Type; selects the
//                 element type the output buffer is allocated with.
//   "value"     - float constant written to the tensor.
//   "force_cpu" - when true the output buffer is allocated on CPUPlace instead
//                 of the kernel's own place.
//   "shape"     - std::vector<int> giving the output dimensions.
template <typename DeviceContext, typename T>
class FillConstantOpKernel : public framework::OpKernel<T> {
 public:
  // Resizes "Out" to the requested shape, allocates it with the requested
  // element type and place, then fills it through math::set_constant.
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto requested_type =
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
    const auto fill_value = ctx.Attr<float>("value");
    const auto keep_on_cpu = ctx.Attr<bool>("force_cpu");

    auto* result = ctx.Output<framework::Tensor>("Out");
    result->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));

    // Allocate the storage; the element type comes from the attribute, not
    // from the kernel's template parameter T.
    const auto element_type = framework::ToTypeIndex(requested_type);
    if (keep_on_cpu) {
      result->mutable_data(platform::CPUPlace(), element_type);
    } else {
      result->mutable_data(ctx.GetPlace(), element_type);
    }

    // NOTE(review): the fill always goes through this kernel's DeviceContext,
    // even when the buffer was forced onto the CPU above - confirm that
    // set_constant handles that combination.
    math::set_constant(ctx.template device_context<DeviceContext>(), result,
                       fill_value);
  }
};
} // namespace operators
} // namespace paddle
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase { ...@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
framework::VisitDataType( framework::VisitDataType(
dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value"))); dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
if (!force_cpu && platform::is_gpu_place(place)) { if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out // Copy tensor to out
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <thrust/transform.h> #include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(gaussian_random, REGISTER_OP_CUDA_KERNEL(gaussian_random,
paddle::operators::GPUGaussianRandomKernel<float>, paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>); paddle::operators::GPUGaussianRandomKernel<double>);
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase { ...@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto out_var = scope.FindVar(Output("Out")); auto out_var = scope.FindVar(Output("Out"));
auto w_var = scope.FindVar(Input("W")); auto w_var = scope.FindVar(Input("W"));
auto ids_var = scope.FindVar(Input("Ids")); auto ids_var = scope.FindVar(Input("Ids"));
unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
float min = Attr<float>("min");
float max = Attr<float>("max");
bool auto_grown_table = Attr<bool>("auto_grown_table");
PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(), PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
"The type of Out var should be LodTensor."); "The type of Out var should be LodTensor.");
...@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase { ...@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto &ids_t = ids_var->Get<framework::LoDTensor>(); auto &ids_t = ids_var->Get<framework::LoDTensor>();
auto out_t = out_var->GetMutable<framework::LoDTensor>(); auto out_t = out_var->GetMutable<framework::LoDTensor>();
auto w_t = w_var->GetMutable<framework::SelectedRows>(); auto w_t = w_var->GetMutable<framework::SelectedRows>();
std::vector<int64_t> keys;
keys.resize(ids_t.numel());
for (int64_t i = 0; i < ids_t.numel(); ++i) {
keys[i] = ids_t.data<int64_t>()[i];
}
// TODO(Yancey1989): support CUDA Place for the sparse table // TODO(Yancey1989): support CUDA Place for the sparse table
platform::CPUPlace cpu; platform::CPUPlace cpu;
auto out_shape = w_t->value().dims(); auto out_shape = w_t->value().dims();
out_shape[0] = keys.size(); out_shape[0] = ids_t.numel();
out_t->Resize(out_shape); out_t->Resize(out_shape);
out_t->mutable_data(cpu, w_t->value().type()); out_t->mutable_data(cpu, w_t->value().type());
PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
framework::proto::VarType::FP32, framework::proto::VarType::FP32,
"The sparse table only support FP32"); "The sparse table only support FP32");
auto non_keys_pair = w_t->Get(keys, out_t); w_t->Get(ids_t, out_t, true);
if (!auto_grown_table) {
PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
"there is some keys does exists in the sparse table.");
}
auto value_shape = w_t->value().dims();
value_shape[0] = 1;
for (const auto &it : non_keys_pair) {
const auto key = it.first;
const auto index = it.second;
framework::Tensor value;
value.Resize(value_shape);
auto data = value.mutable_data<float>(cpu);
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(min, max);
int64_t size = value.numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
w_t->Set(key, value);
memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
cpu, value.data<float>(), value.numel() * sizeof(float));
}
} }
}; };
...@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output " "Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.") "with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding); .SetDefault(kNoPadding);
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f);
AddAttr<float>("max",
"(float, default 1.0) "
"Maximum value of uniform random")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<bool>("auto_grown_table", AddAttr<bool>("auto_grown_table",
"(bool default false)" "(bool default false)"
"Whether create new value if for nonexistent key.") "Whether create new value if for nonexistent key.")
......
...@@ -15,25 +15,11 @@ limitations under the License. */ ...@@ -15,25 +15,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
// Natural-logarithm helper used by the cross-entropy CUDA kernels in this
// file. The primary template simply forwards to std::log.
template <typename T>
HOSTDEVICE T log(const T& val) {
  return std::log(val);
}

// float16 specialization.
//
// The previous implementation returned a constant 0 because the half
// intrinsic hlog was reported as unavailable, which silently zeroed every
// log term computed in half precision. The logarithm is now evaluated in
// single precision and the result converted back to float16, which needs no
// half-specific intrinsic.
// NOTE(review): assumes platform::float16 is explicitly convertible to and
// from float in both host and device code - confirm against float16.h.
template <>
HOSTDEVICE platform::float16 log(const platform::float16& val) {
  return static_cast<platform::float16>(std::log(static_cast<float>(val)));
}
namespace { namespace {
template <typename T> template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
...@@ -49,12 +35,12 @@ template <typename T> ...@@ -49,12 +35,12 @@ template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) { const int class_num) {
int tid = threadIdx.x; int tid = threadIdx.x;
T val(0); T val = 0;
int idx = blockIdx.x * class_num + tid; int idx = blockIdx.x * class_num + tid;
int end = blockIdx.x * class_num + class_num; int end = blockIdx.x * class_num + class_num;
for (; idx < end; idx += blockDim.x) { for (; idx < end; idx += blockDim.x) {
val += math::TolerableValue<T>()(log(X[idx])) * label[idx]; val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
} }
val = paddle::platform::reduceSum(val, tid, blockDim.x); val = paddle::platform::reduceSum(val, tid, blockDim.x);
...@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>; template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>; template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
...@@ -35,21 +33,6 @@ struct TolerableValue { ...@@ -35,21 +33,6 @@ struct TolerableValue {
} }
}; };
// float16 value clip behave different.
using paddle::platform::float16;
using paddle::platform::isfinite;
// float16 specialization of TolerableValue: finite inputs pass through
// unchanged, every other input is replaced by a finite float16 bound.
template <>
struct TolerableValue<float16> {
  // Returns x when it is finite. Otherwise returns the largest finite float16
  // when x compares greater than zero, and the most negative finite float16
  // in every remaining case.
  HOSTDEVICE float16 operator()(const float16& x) const {
    if (isfinite(x)) {
      return x;
    }
    if (x > static_cast<float16>(0)) {
      return std::numeric_limits<float16>::max();
    }
    // For floating-point types numeric_limits<>::min() is the smallest
    // positive normal value, not the most negative one; lowest() is the
    // correct clamp for the negative side.
    return std::numeric_limits<float16>::lowest();
  }
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class CrossEntropyFunctor { class CrossEntropyFunctor {
public: public:
......
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template struct SelectedRowsAdd<platform::CUDADeviceContext, float>; template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, double>; template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
namespace { namespace {
template <typename T, int block_size> template <typename T, int block_size>
...@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto* out_data = output->data<T>(); auto* out_data = output->data<T>();
SetConstant<platform::CUDADeviceContext, T> functor; SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0)); functor(context, output, 0.0);
const int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
...@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>; template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>; template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template <typename T> template <typename T>
struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
...@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>; ...@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>; template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>; template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>; template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext,
platform::float16>;
namespace { namespace {
template <typename T, int block_size> template <typename T, int block_size>
...@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>; ...@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
namespace scatter { namespace scatter {
...@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context.GetPlace()); context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor; math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0)); constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>(); auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>(); auto* input_data = input.value().data<T>();
...@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>; ...@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>; template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>; template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>; template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template <typename T, int block_size> template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows, __global__ void UpdateToTensorKernel(const T* selected_rows,
......
...@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()( ...@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template class SoftmaxCUDNNFunctor<platform::float16>; template class SoftmaxCUDNNFunctor<platform::float16>;
template class SoftmaxCUDNNFunctor<float>; template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>; template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<platform::float16>;
template class SoftmaxGradCUDNNFunctor<float>; template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>; template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>; template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>; template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>; template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
......
...@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h" #include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>, mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>, ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>, mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
...@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> { ...@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG->mutable_data<T>(context.GetPlace()); IG->mutable_data<T>(context.GetPlace());
T ig_size = static_cast<T>(IG->numel()); T ig_size = static_cast<T>(IG->numel());
Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size)); Eigen::DSizes<int, 1> bcast(ig_size);
EigenVector<T>::Flatten(*IG).device( EigenVector<T>::Flatten(*IG).device(
*context.template device_context<DeviceContext>().eigen_device()) = *context.template device_context<DeviceContext>().eigen_device()) =
......
...@@ -20,7 +20,6 @@ namespace plat = paddle::platform; ...@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>, REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>, ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>); ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(mul_grad,
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>, ops::MulGradKernel<plat::CUDADeviceContext, double>);
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, ...@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>); ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>, ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>, ops::PoolCUDNNGradOpKernel<double>);
ops::PoolCUDNNGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>, ops::PoolCUDNNOpKernel<float>,
...@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, ...@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>); ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>, ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>, ops::PoolCUDNNGradOpKernel<double>);
ops::PoolCUDNNGradOpKernel<plat::float16>);
...@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase { ...@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase {
class RecvOpMaker : public framework::OpProtoAndCheckerMaker { class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() { void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable(); AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Recv operator Recv operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Operator definition for `sampling_id`: validates inputs and attributes and
// infers the shape of the output.
class SamplingIdOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that X and Out are present, that attribute `min` is strictly less
  // than `max`, and that X is 2-D. Out is then declared as a 1-D tensor with
  // one entry per row of X, and the LoD of X is shared with Out.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SamplingIdOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SamplingIdOp should not be null.");
    PADDLE_ENFORCE(
        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
        "min must be less than max");

    auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 2, "Input(X) should be 2-D tensor.");

    // SamplingIdKernel resizes Out to [batch_size]; declare the same shape
    // here so compile-time shape inference agrees with the kernel.
    ctx->SetOutputDim("Out", framework::make_ddim({input_dims[0]}));
    ctx->ShareLoD("X", "Out");
  }
};
// Declares the proto of the sampling_id operator: one input (X), one output
// (Out), the operator-level comment, and the attributes min, max and seed.
class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
// Registers the inputs, outputs, comment and attributes in the order below.
void Make() override {
// X is described as a 2-D tensor of shape
// [batch_size, input_feature_dimensions].
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
AddOutput("Out", "SamplingId data tensor.");
AddComment(R"DOC(
SamplingId Operator.
A layer for sampling id from multinomial distribution from the
input. Sampling one id for one sample.)DOC");
// Bounds of the uniform random draw; both default to the unit interval.
AddAttr<float>("min", "Minimum value of random. [default 0.0].")
.SetDefault(0.0f);
// NOTE(review): "Maximun" in the description below is a typo for "Maximum".
AddAttr<float>("max", "Maximun value of random. [default 1.0].")
.SetDefault(1.0f);
// NOTE(review): the second and third literals below concatenate without a
// separating space ("...system.Note that...").
AddAttr<int>("seed",
"Random seed used for the random number engine. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. [default 0].")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
// Register the sampling_id operator with an empty gradient maker, plus its
// CPU kernels for float and double.
namespace ops = paddle::operators;
REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(sampling_id, ops::SamplingIdKernel<float>,
                       ops::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
// Register the CUDA kernels of sampling_id for float and double; the same
// SamplingIdKernel template is used as for the CPU registration.
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(sampling_id, ops::SamplingIdKernel<float>,
                        ops::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <random>
#include <sstream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Kernel of the sampling_id operator.
//
// For every row of the 2-D input X it draws one uniform random number in
// [min, max) and walks the row, subtracting each entry from the draw; the
// first column at which the running value becomes negative is the sampled id.
// If that never happens the last column (width - 1) is used.
template <typename T>
class SamplingIdKernel : public framework::OpKernel<T> {
 public:
  // Reads input "X" and the attributes "seed", "min" and "max"; writes a 1-D
  // tensor of batch_size elements of type T to output "Out", each holding the
  // sampled column index of the corresponding row.
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("X");
    const int batch_size = static_cast<int>(input->dims()[0]);
    const int width = static_cast<int>(input->dims()[1]);

    PADDLE_ENFORCE_GE(batch_size, 0,
                      "batch_size(dims[0]) must be nonnegative.");
    PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");

    // Copy the input into a std::vector so the sampling loop below can index
    // it directly (presumably this also handles device-to-host transfer for
    // the CUDA registration -- confirm against TensorToVector).
    std::vector<T> ins_vector;
    framework::TensorToVector(*input, context.device_context(), &ins_vector);

    // A seed of 0 means "pick a non-deterministic seed".
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
    if (seed == 0) {
      seed = std::random_device()();
    }
    std::minstd_rand engine;
    engine.seed(seed);
    std::uniform_real_distribution<T> dist(
        static_cast<T>(context.Attr<float>("min")),
        static_cast<T>(context.Attr<float>("max")));

    std::vector<T> ids(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      // Offset of row i, computed in size_t to avoid int overflow.
      const size_t row_begin = static_cast<size_t>(i) * width;
      T r = dist(engine);
      int idx = width - 1;
      for (int j = 0; j < width; ++j) {
        if ((r -= ins_vector[row_begin + j]) < 0) {
          idx = j;
          break;
        }
      }
      // Emit the sampled id itself, not the probability stored at that
      // position. This also avoids an out-of-range read when width == 0.
      ids[i] = static_cast<T>(idx);
    }

    std::vector<int64_t> out_dim;
    out_dim.push_back(static_cast<int64_t>(batch_size));

    Tensor* output = context.Output<Tensor>("Out");
    output->Resize(framework::make_ddim(out_dim));
    output->mutable_data<T>(context.GetPlace());
    framework::TensorFromVector(ids, context.device_context(), output);
  }
};
} // namespace operators
} // namespace paddle
...@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scale_op.h" #include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
scale, scale,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>, paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>, paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int64_t>, int64_t>);
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
...@@ -37,23 +37,20 @@ class SendBarrierOp : public framework::OperatorBase { ...@@ -37,23 +37,20 @@ class SendBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override { const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints"); std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; VLOG(3) << "SendBarrierOp sync";
// need to wait before sending send_barrier message // need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
if (sync_mode) {
for (auto& ep : eps) { for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep; VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
} }
}
}; };
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -70,7 +67,6 @@ the Parameter Server would knew all variables have been sent. ...@@ -70,7 +67,6 @@ the Parameter Server would knew all variables have been sent.
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.") "Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"}); .SetDefault({"127.0.0.1:6164"});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
} }
}; };
......
...@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() { void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
......
...@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < grad.rows().size(); i++) { for (size_t i = 0; i < grad.rows().size(); i++) {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(), PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height"); "Input rows index should less than height");
int64_t id_index = param.Index(grad.rows()[i]); int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0), PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table"); "id should be in the table");
for (int64_t j = 0; j < grad_row_width; j++) { for (int64_t j = 0; j < grad_row_width; j++) {
......
...@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, ...@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops::SoftmaxCUDNNKernel<float>, ops::SoftmaxCUDNNKernel<float>,
ops::SoftmaxCUDNNKernel<plat::float16>); ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>, ops::SoftmaxGradCUDNNKernel<float>);
ops::SoftmaxGradCUDNNKernel<plat::float16>);
...@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>); ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>, softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>, ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -11,13 +11,10 @@ limitations under the License. */ ...@@ -11,13 +11,10 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>, sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
ops::SumKernel<paddle::platform::CUDADeviceContext, double>, ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>, ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
...@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if (!in_place) { if (!in_place) {
math::SetConstant<DeviceContext, T> constant_functor; math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out, constant_functor(context.template device_context<DeviceContext>(), out,
static_cast<T>(0)); 0.0);
} }
math::SelectedRowsAddToTensor<DeviceContext, T> functor; math::SelectedRowsAddToTensor<DeviceContext, T> functor;
......
...@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using paddle::platform::float16;
template <typename T> template <typename T>
struct Pair { struct Pair {
...@@ -35,11 +32,6 @@ struct Pair { ...@@ -35,11 +32,6 @@ struct Pair {
id = id; id = id;
} }
// Resets this pair to the "empty" sentinel: id -1 and value -infinity.
__device__ __forceinline__ void clear() {
  id = -1;
  v = -INFINITY;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) { __device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v; v = in.v;
id = in.id; id = in.id;
...@@ -61,12 +53,6 @@ struct Pair { ...@@ -61,12 +53,6 @@ struct Pair {
int64_t id; int64_t id;
}; };
// float16 specialization of Pair::clear(): resets the pair to the "empty"
// sentinel, i.e. value negative infinity and id -1, matching the generic
// clear().
template <>
__device__ __forceinline__ void Pair<float16>::clear() {
  // 0xfc00 is the IEEE 754 binary16 bit pattern of negative infinity (sign
  // bit set, exponent all ones, zero mantissa). The previous pattern 0x400
  // encodes a small positive value, which would outrank negative inputs.
  v = platform::raw_uint16_to_float16(0xfc00);
  id = -1;
}
template <typename T> template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p, __device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size) { int beam_size) {
...@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) { if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].clear(); topk[k].set(-INFINITY, -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
} }
*max = topk[MaxLength - 1]; *max = topk[MaxLength - 1];
if ((*max).v == static_cast<T>(-1)) *is_empty = true; if ((*max).v == -1) *is_empty = true;
*beam = 0; *beam = 0;
} }
} }
...@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) { if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(std::numeric_limits<T>::min(), -1); topk[k].set(-INFINITY, -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, ...@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true; bool firststep = true;
for (int k = 0; k < MaxLength; k++) { for (int k = 0; k < MaxLength; k++) {
topk[k].clear(); topk[k].set(-INFINITY, -1);
} }
while (k) { while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k, ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
...@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
top_k, paddle::operators::TopkOpCUDAKernel<float>, paddle::operators::TopkOpCUDAKernel<double>);
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
...@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape"); auto shape = ctx.Attr<std::vector<int>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value(); auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
tensor = selected_rows->mutable_value();
tensor->Resize(framework::make_ddim(shape)); tensor->Resize(framework::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]);
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"uniform_random_op's output only" "uniform_random_op's output only"
......
...@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h> #include <thrust/random.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -40,11 +36,6 @@ struct UniformGenerator { ...@@ -40,11 +36,6 @@ struct UniformGenerator {
} }
}; };
// Conversion functor: maps a value of type T to type V via static_cast.
template <typename T, typename V>
struct CastFunctor {
  // Returns `src` converted to V.
  HOSTDEVICE V operator()(const T& src) {
    V converted = static_cast<V>(src);
    return converted;
  }
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT. // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to // Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random. // implement uniform random.
...@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T max = static_cast<T>(context.Attr<float>("max")); T max = static_cast<T>(context.Attr<float>("max"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
if (out_var->IsType<framework::LoDTensor>() &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
framework::Tensor master_copy_tensor;
master_copy_tensor.Resize(tensor->dims());
float* master_copy_tensor_data =
master_copy_tensor.mutable_data<float>(context.GetPlace());
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<float>(master_copy_tensor_data),
UniformGenerator<float>(static_cast<float>(min),
static_cast<float>(max), seed));
platform::Transform<platform::CUDADeviceContext> trans;
auto* in_begin = master_copy_tensor.data<float>();
auto* in_end = in_begin + master_copy_tensor.numel();
auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
trans(context.template device_context<platform::CUDADeviceContext>(),
in_begin, in_end, out_begin, CastFunctor<float, T>());
} else {
thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed)); UniformGenerator<T>(min, max, seed));
} }
if (VLOG_IS_ON(5)) {
framework::Tensor cpu_tensor;
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
auto& dev_ctx =
*platform::DeviceContextPool::Instance().Get(context.GetPlace());
dev_ctx.Wait();
auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
VLOG(5) << "The Uniform output " << x;
}
}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(uniform_random,
REGISTER_OP_CUDA_KERNEL( paddle::operators::GPUUniformRandomKernel<float>,
uniform_random, paddle::operators::GPUUniformRandomKernel<float>, paddle::operators::GPUUniformRandomKernel<double>);
paddle::operators::GPUUniformRandomKernel<double>, REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>, paddle::operators::GPUUniformRandomKernel<double>);
paddle::operators::GPUUniformRandomKernel<plat::float16>);
...@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, ...@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
#if CUDA_VERSION < 9000 #if CUDA_VERSION < 9000
return __shfl_down(val, delta, width); return __shfl_down(val, delta, width);
#else #else
return __shfl_down_sync(mask, val, delta, width); return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
#endif #endif
} }
...@@ -46,9 +46,16 @@ template <> ...@@ -46,9 +46,16 @@ template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta, float16 val, int delta,
int width) { int width) {
half tmp = static_cast<half>(val); return float16(
__shfl_down(tmp, static_cast<unsigned>(delta), width); __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
return float16(tmp); }
#else
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
return float16(__shfl_down_sync(mask, static_cast<half>(val),
static_cast<unsigned>(delta), width));
} }
#endif #endif
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm>
#include <iostream> #include <iostream>
#include <random> #include <random>
...@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) { ...@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
for (size_t i = 0; i < num / 2; ++i) { for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow // NOTE(dzhwinter): the float16 add has small truncate error.
// so we use EXPECT_NEAR to check the result. // so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(out[i]), EXPECT_NEAR(static_cast<float>(out[i]),
static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])), static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
...@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) { ...@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3); TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3); TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
} }
// Warp-level sum built on shuffle-down, following
// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
//
// Starting at half the warp size and halving until it reaches zero, each step
// adds to `val` the value returned by CudaShuffleDownSync for the current
// offset. Returns the calling thread's accumulated value.
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val) {
  unsigned shfl_mask = 0u;
  CREATE_SHFL_MASK(shfl_mask, true);
  int offset = warpSize / 2;
  while (offset > 0) {
    val += paddle::platform::CudaShuffleDownSync(shfl_mask, val, offset);
    offset /= 2;
  }
  return val;
}
// Block-level sum: every warp reduces its own values with WarpReduceSum, the
// first lane of each warp publishes that partial sum through shared memory,
// and the first warp then reduces the published partial sums.
//
// NOTE(review): the number of partial sums read back is
// blockDim.x / warpSize (integer division), so a trailing partial warp would
// not be included -- presumably callers launch a multiple of warpSize threads;
// confirm.
template <typename T>
__forceinline__ __device__ T BlockReduce(T val) {
  static __shared__ T warp_sums[32];  // One slot per warp, 32 slots in total
  const int lane_id = threadIdx.x % warpSize;
  const int warp_id = threadIdx.x / warpSize;

  // Step 1: partial reduction inside each warp.
  val = WarpReduceSum(val);

  // Step 2: lane 0 of every warp stores its warp's partial sum.
  if (lane_id == 0) {
    warp_sums[warp_id] = val;
  }

  // Step 3: wait until all partial sums have been written.
  __syncthreads();

  // Step 4: only threads whose index maps to an existing warp slot load a
  // partial sum; every other thread contributes zero.
  if (threadIdx.x < blockDim.x / warpSize) {
    val = warp_sums[lane_id];
  } else {
    val = static_cast<T>(0);
  }

  // Step 5: final reduction within the first warp.
  if (warp_id == 0) {
    val = WarpReduceSum(val);
  }
  return val;
}
// Kernel that sums the first N elements of `in`.
//
// Each thread accumulates the elements at its global index and every
// (blockDim.x * gridDim.x)-th element after it, the per-thread sums are
// combined with BlockReduce, and thread 0 of each block stores the block's
// result in out[blockIdx.x].
//
// in:  input array holding at least N elements.
// out: output array with one slot per launched block.
// N:   number of input elements to sum.
template <typename T>
__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
  T sum(0);
  // Use size_t for the index and stride so the comparison against N is not a
  // signed/unsigned mix and the index cannot overflow an int for large N.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < N; i += stride) {
    sum += in[i];
  }
  sum = BlockReduce<T>(sum);
  __syncthreads();
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}
// Host-side check of DeviceReduceSum for element type T.
//
// Fills a host buffer with `num` random values drawn from
// std::uniform_real_distribution<double>(0.0, 1.0), sums them on the host with
// std::accumulate, launches DeviceReduceSum with a single block of
// PADDLE_CUDA_NUM_THREADS threads, and expects the device result to be within
// `atol` of the host sum.
//
// NOTE(review): std::accumulate is declared in <numeric>; confirm that header
// is included by this file.
template <typename T>
void TestReduce(size_t num, float atol = 0.01) {
  const size_t size = sizeof(T) * num;

  // Device buffers: `num` inputs and a single output element.
  T* dev_input;
  T* dev_output;
  cudaMalloc(reinterpret_cast<void**>(&dev_input), size);
  cudaMalloc(reinterpret_cast<void**>(&dev_output), sizeof(T));

  // Host buffer; it is later reused to receive the device result.
  T* host_buf = reinterpret_cast<T*>(malloc(size));

  std::minstd_rand engine;
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  for (size_t idx = 0; idx < num; ++idx) {
    host_buf[idx] = static_cast<T>(dist(engine));
  }

  // Reference sum, accumulated in type T on the host.
  auto expected = std::accumulate(host_buf, host_buf + num, static_cast<T>(0));

  cudaMemcpy(dev_input, host_buf, size, cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(dev_input, dev_output,
                                                     num);
  cudaMemcpy(host_buf, dev_output, sizeof(T), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  // NOTE(dzhwinter): float16 addition is inexact, so the device and host sums
  // are compared with EXPECT_NEAR rather than for exact equality.
  EXPECT_NEAR(static_cast<float>(host_buf[0]), static_cast<float>(expected),
              atol);

  free(host_buf);
  cudaFree(dev_input);
  cudaFree(dev_output);
}
// Runs the shuffle-based block reduction check (TestReduce) for float and
// float16 element types.
TEST(CudaShuffleSync, float16) {
// float inputs use TestReduce's default tolerance.
TestReduce<float>(10);
TestReduce<float>(1000);
// float16 can overflow or accumulate truncation error on large inputs, so
// only small sizes are tested, and the 100-element case uses a looser
// tolerance.
TestReduce<float16>(10);
TestReduce<float16>(100, /*atol error*/ 1.0);
}
...@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) { ...@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) {
void BindVarDsec(pybind11::module *m) { void BindVarDsec(pybind11::module *m) {
pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", ""); pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
var_desc var_desc
.def("name", .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
[](pd::VarDesc &self) {
pybind11::bytes name = self.Name();
return name;
},
pybind11::return_value_policy::reference)
.def("set_name", &pd::VarDesc::SetName) .def("set_name", &pd::VarDesc::SetName)
.def("set_shape", &pd::VarDesc::SetShape) .def("set_shape", &pd::VarDesc::SetShape)
.def("set_shapes", &pd::VarDesc::SetShapes) .def("set_shapes", &pd::VarDesc::SetShapes)
......
...@@ -54,6 +54,8 @@ limitations under the License. */ ...@@ -54,6 +54,8 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
#include "pybind11/stl.h"
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
...@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) { ...@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) {
self.set_rows(new_rows); self.set_rows(new_rows);
#endif #endif
}) })
.def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
.def("rows", [](SelectedRows &self) { .def("rows", [](SelectedRows &self) {
auto rows = self.rows(); auto rows = self.rows();
std::vector<int64_t> new_rows; std::vector<int64_t> new_rows;
......
...@@ -54,7 +54,7 @@ function cpu_config() { ...@@ -54,7 +54,7 @@ function cpu_config() {
if [ $platform == "Linux" ]; then if [ $platform == "Linux" ]; then
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
elif [ $platform == "Darwin" ]; then elif [ $platform == "Darwin" ]; then
if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
# HT is OFF # HT is OFF
ht=1 ht=1
fi fi
......
...@@ -24,4 +24,5 @@ except ImportError: ...@@ -24,4 +24,5 @@ except ImportError:
import paddle.reader import paddle.reader
import paddle.dataset import paddle.dataset
import paddle.batch import paddle.batch
import paddle.compat
batch = batch.batch batch = batch.batch
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import math
__all__ = [
'long_type',
'to_text',
'to_bytes',
'round',
'floor_division',
'get_exception_message',
]
# Integer type aliases. ``int_type`` is always the builtin ``int``;
# ``long_type`` is the builtin ``long`` on Python 2 and ``int`` otherwise.
int_type = int
long_type = long if six.PY2 else int
# str and bytes related functions
def to_text(obj, encoding='utf-8', inplace=False):
    """
    Convert an object, or every item of a list or set, to a text string.

    Each value is converted with ``_to_text``:

    In Python3:
        Decode the bytes type object to str type with specific encoding

    In Python2:
        Decode the str type object to unicode type with specific encoding

    Args:
        obj(unicode|str|bytes|list|set) : The object to be decoded.
        encoding(str) : The encoding format to decode a string
        inplace(bool) : Only used for list and set inputs. If True, the
            container passed in is modified and returned; otherwise a new
            container is created.

    Returns:
        Decoded result of obj. ``None`` is returned unchanged.
    """
    if obj is None:
        return obj

    if isinstance(obj, list):
        if inplace:
            for i, item in enumerate(obj):
                obj[i] = _to_text(item, encoding)
            return obj
        else:
            return [_to_text(item, encoding) for item in obj]
    elif isinstance(obj, set):
        if inplace:
            # Convert everything first and only then rewrite the set: removing
            # and adding elements while iterating over the same set can skip
            # or revisit items.
            converted = [_to_text(item, encoding) for item in obj]
            obj.clear()
            obj.update(converted)
            return obj
        else:
            return set([_to_text(item, encoding) for item in obj])
    else:
        return _to_text(obj, encoding)
def _to_text(obj, encoding):
    """
    Convert a single object to a text string.

    In Python3:
        Decode the bytes type object to str type with specific encoding

    In Python2:
        Decode the str type object to unicode type with specific encoding,
        or we just return the unicode string of object

    Args:
        obj(unicode|str|bytes) : The object to be decoded.
        encoding(str) : The encoding format

    Returns:
        decoded result of obj. ``None`` is returned unchanged.
    """
    if obj is None:
        return obj

    # Binary data is decoded with the requested encoding.
    if isinstance(obj, six.binary_type):
        return obj.decode(encoding)

    # Text is already in the target form.
    if isinstance(obj, six.text_type):
        return obj

    # Anything else is handed to six.u().
    return six.u(obj)
def to_bytes(obj, encoding='utf-8', inplace=False):
    """
    Convert an object, or every item of a list or set, to bytes.

    Each value is converted with ``_to_bytes``:

    In Python3:
        Encode the str type object to bytes type with specific encoding

    In Python2:
        Encode the unicode type object to str type with specific encoding,
        or we just return the 8-bit string of object

    Args:
        obj(unicode|str|bytes|list|set) : The object to be encoded.
        encoding(str) : The encoding format to encode a string
        inplace(bool) : Only used for list and set inputs. If True, the
            container passed in is modified and returned; otherwise a new
            container is created.

    Returns:
        Encoded result of obj. ``None`` is returned unchanged.
    """
    if obj is None:
        return obj

    if isinstance(obj, list):
        if inplace:
            for i, item in enumerate(obj):
                obj[i] = _to_bytes(item, encoding)
            return obj
        else:
            return [_to_bytes(item, encoding) for item in obj]
    elif isinstance(obj, set):
        if inplace:
            # Convert everything first and only then rewrite the set: removing
            # and adding elements while iterating over the same set can skip
            # or revisit items.
            converted = [_to_bytes(item, encoding) for item in obj]
            obj.clear()
            obj.update(converted)
            return obj
        else:
            return set([_to_bytes(item, encoding) for item in obj])
    else:
        return _to_bytes(obj, encoding)
def _to_bytes(obj, encoding):
    """
    Convert a single object to bytes.

    In Python3:
        Encode the str type object to bytes type with specific encoding

    In Python2:
        Encode the unicode type object to str type with specific encoding,
        or we just return the 8-bit string of object

    Args:
        obj(unicode|str|bytes) : The object to be encoded.
        encoding(str) : The encoding format, must not be None

    Returns:
        encoded result of obj. ``None`` is returned unchanged.
    """
    if obj is None:
        return obj

    assert encoding is not None

    # Text is encoded with the requested encoding.
    if isinstance(obj, six.text_type):
        return obj.encode(encoding)

    # Binary data is already in the target form.
    if isinstance(obj, six.binary_type):
        return obj

    # Anything else is handed to six.b().
    return six.b(obj)
# math related functions
def round(x, d=0):
    """
    Compatible round which behaves the same way in Python 2 and Python 3.

    In Python 3 the value is scaled by ``10**d``, shifted by 0.5 away from
    zero, snapped with floor (positive x) or ceil (negative x) and scaled
    back, so halfway cases round away from zero. In Python 2 the builtin
    ``round`` is called.

    Args:
        x(float) : The number to round.
        d(int) : The power of ten used for scaling, i.e. the number of
            decimal digits to keep. Defaults to 0.

    Returns:
        round result of x
    """
    if not six.PY3:
        import __builtin__
        return __builtin__.round(x, d)

    # Python 3's builtin round behaves differently, so the rounding is
    # implemented by hand according to this answer:
    # https://www.techforgeek.info/round_python.html
    if x > 0.0:
        snap = math.floor
    elif x < 0.0:
        snap = math.ceil
    else:
        # Neither positive nor negative: return a zero carrying x's sign.
        return math.copysign(0.0, x)

    scale = 10**d
    return float(snap((x * scale) + math.copysign(0.5, x))) / scale
def floor_division(x, y):
    """
    Floor division that is spelled the same way in Python 2 and Python 3.

    The result is computed with the ``//`` operator.

    Args:
        x(int|float) : The dividend.
        y(int|float) : The divisor.

    Returns:
        division result of x // y
    """
    quotient = x // y
    return quotient
# exception related functions
def get_exception_message(exc):
    """
    Get the error message of a specific exception.

    In Python 2 the exception's ``message`` attribute is returned; otherwise
    the exception is converted with ``str``.

    Args:
        exc(Exception) : The exception to get the error message from. Must
            not be None.

    Returns:
        the error message of exc
    """
    assert exc is not None

    return exc.message if six.PY2 else str(exc)
...@@ -28,11 +28,13 @@ images per class. ...@@ -28,11 +28,13 @@ images per class.
""" """
from __future__ import print_function
import itertools import itertools
import numpy import numpy
import paddle.dataset.common import paddle.dataset.common
import tarfile import tarfile
from six.moves import zip import six
from six.moves import cPickle as pickle from six.moves import cPickle as pickle
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
...@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' ...@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
def reader_creator(filename, sub_name, cycle=False): def reader_creator(filename, sub_name, cycle=False):
def read_batch(batch): def read_batch(batch):
data = batch['data'] data = batch[six.b('data')]
labels = batch.get('labels', batch.get('fine_labels', None)) labels = batch.get(
six.b('labels'), batch.get(six.b('fine_labels'), None))
assert labels is not None assert labels is not None
for sample, label in zip(data, labels): for sample, label in six.moves.zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label) yield (sample / 255.0).astype(numpy.float32), int(label)
def reader(): def reader():
...@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False): ...@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False):
while True: while True:
for name in names: for name in names:
if six.PY2:
batch = pickle.load(f.extractfile(name)) batch = pickle.load(f.extractfile(name))
else:
batch = pickle.load(
f.extractfile(name), encoding='bytes')
for item in read_batch(batch): for item in read_batch(batch):
yield item yield item
if not cycle: if not cycle:
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import requests import requests
import hashlib import hashlib
import os import os
...@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None):
total_length = r.headers.get('content-length') total_length = r.headers.get('content-length')
if total_length is None: if total_length is None:
with open(filename, 'w') as f: with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f) shutil.copyfileobj(r.raw, f)
else: else:
with open(filename, 'w') as f: with open(filename, 'wb') as f:
dl = 0 dl = 0
total_length = int(total_length) total_length = int(total_length)
for data in r.iter_content(chunk_size=4096): for data in r.iter_content(chunk_size=4096):
......
...@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used ...@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used
to initialize SRL model. to initialize SRL model.
""" """
from __future__ import print_function
import tarfile import tarfile
import gzip import gzip
import itertools import itertools
import paddle.dataset.common import paddle.dataset.common
from six.moves import zip import paddle.compat as cpt
from six.moves import zip, range
__all__ = ['test, get_dict', 'get_embedding', 'convert'] __all__ = ['test, get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc' DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt' WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
...@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name): ...@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name):
labels = [] labels = []
one_seg = [] one_seg = []
for word, label in zip(words_file, props_file): for word, label in zip(words_file, props_file):
word = word.strip() word = cpt.to_text(word.strip())
label = label.strip().split() label = cpt.to_text(label.strip().split())
if len(label) == 0: # end of sentence if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])): for i in range(len(one_seg[0])):
......
...@@ -28,6 +28,9 @@ Graphics and Image Processing (2008) ...@@ -28,6 +28,9 @@ Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
""" """
from __future__ import print_function
import itertools import itertools
import functools import functools
from .common import download from .common import download
...@@ -116,7 +119,7 @@ def reader_creator(data_file, ...@@ -116,7 +119,7 @@ def reader_creator(data_file,
for file in open(file_list): for file in open(file_list):
file = file.strip() file = file.strip()
batch = None batch = None
with open(file, 'r') as f: with open(file, 'rb') as f:
batch = pickle.load(f) batch = pickle.load(f)
data = batch['data'] data = batch['data']
labels = batch['label'] labels = batch['label']
......
...@@ -29,10 +29,18 @@ the image layout as follows. ...@@ -29,10 +29,18 @@ the image layout as follows.
formats can be used for training. Noted that, the format should formats can be used for training. Noted that, the format should
be keep consistent between the training and inference peroid. be keep consistent between the training and inference peroid.
""" """
from __future__ import print_function
import numpy as np import numpy as np
try: try:
import cv2 import cv2
except ImportError: except ImportError:
import sys
sys.stderr.write(
'''Warning with paddle image module: opencv-python should be imported,
or paddle image module could NOT work; please install opencv-python first.'''
)
cv2 = None cv2 = None
import os import os
import tarfile import tarfile
...@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file, ...@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file,
output['data'] = data output['data'] = data
pickle.dump( pickle.dump(
output, output,
open('%s/batch_%d' % (out_path, file_id), 'w'), open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL) protocol=pickle.HIGHEST_PROTOCOL)
file_id += 1 file_id += 1
data = [] data = []
...@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file, ...@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file,
output['data'] = data output['data'] = data
pickle.dump( pickle.dump(
output, output,
open('%s/batch_%d' % (out_path, file_id), 'w'), open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL) protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta: with open(meta_file, 'a') as meta:
...@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True): ...@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True):
load and return a gray image. load and return a gray image.
:type is_color: bool :type is_color: bool
""" """
assert cv2 is not None
flag = 1 if is_color else 0 flag = 1 if is_color else 0
file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
img = cv2.imdecode(file_bytes, flag) img = cv2.imdecode(file_bytes, flag)
...@@ -149,6 +159,8 @@ def load_image(file, is_color=True): ...@@ -149,6 +159,8 @@ def load_image(file, is_color=True):
load and return a gray image. load and return a gray image.
:type is_color: bool :type is_color: bool
""" """
assert cv2 is not None
# cv2.IMAGE_COLOR for OpenCV3 # cv2.IMAGE_COLOR for OpenCV3
# cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
# cv2.IMAGE_GRAYSCALE for OpenCV3 # cv2.IMAGE_GRAYSCALE for OpenCV3
...@@ -176,12 +188,14 @@ def resize_short(im, size): ...@@ -176,12 +188,14 @@ def resize_short(im, size):
:param size: the shorter edge size of image after resizing. :param size: the shorter edge size of image after resizing.
:type size: int :type size: int
""" """
assert cv2 is not None
h, w = im.shape[:2] h, w = im.shape[:2]
h_new, w_new = size, size h_new, w_new = size, size
if h > w: if h > w:
h_new = size * h / w h_new = size * h // w
else: else:
w_new = size * w / h w_new = size * w // h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
return im return im
...@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True): ...@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True):
:type is_color: bool :type is_color: bool
""" """
h, w = im.shape[:2] h, w = im.shape[:2]
h_start = (h - size) / 2 h_start = (h - size) // 2
w_start = (w - size) / 2 w_start = (w - size) // 2
h_end, w_end = h_start + size, w_start + size h_end, w_end = h_start + size, w_start + size
if is_color: if is_color:
im = im[h_start:h_end, w_start:w_end, :] im = im[h_start:h_end, w_start:w_end, :]
......
...@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing. ...@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary. Besides, this module also provides API for building dictionary.
""" """
from __future__ import print_function
import paddle.dataset.common import paddle.dataset.common
import collections import collections
import tarfile import tarfile
import re import re
import string import string
import six
__all__ = ['build_dict', 'train', 'test', 'convert'] __all__ = ['build_dict', 'train', 'test', 'convert']
...@@ -42,13 +45,14 @@ def tokenize(pattern): ...@@ -42,13 +45,14 @@ def tokenize(pattern):
# sequential access of member files, other than # sequential access of member files, other than
# tarfile.extractfile, which does random access and might # tarfile.extractfile, which does random access and might
# destroy hard disks. # destroy hard disks.
tf = next(tarf) tf = tarf.next()
while tf != None: while tf != None:
if bool(pattern.match(tf.name)): if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization. # newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate( yield tarf.extractfile(tf).read().rstrip(six.b(
None, string.punctuation).lower().split() "\n\r")).translate(
tf = next(tarf) None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def build_dict(pattern, cutoff): def build_dict(pattern, cutoff):
...@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff): ...@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff):
word_freq[word] += 1 word_freq[word] += 1
# Not sure if we should prune less-frequent words here. # Not sure if we should prune less-frequent words here.
word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff] word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary)) words, _ = list(zip(*dictionary))
word_idx = dict(list(zip(words, list(range(len(words)))))) word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words) word_idx['<unk>'] = len(words)
return word_idx return word_idx
......
...@@ -18,9 +18,13 @@ This module will download dataset from ...@@ -18,9 +18,13 @@ This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators. into paddle reader creators.
""" """
from __future__ import print_function
import paddle.dataset.common import paddle.dataset.common
import collections import collections
import tarfile import tarfile
import six
__all__ = ['train', 'test', 'build_dict', 'convert'] __all__ = ['train', 'test', 'build_dict', 'convert']
...@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50): ...@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50):
# remove <unk> for now, since we will set it as last index # remove <unk> for now, since we will set it as last index
del word_freq['<unk>'] del word_freq['<unk>']
word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq] word_freq = [
x for x in six.iteritems(word_freq) if x[1] > min_word_freq
]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted)) words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, list(range(len(words)))))) word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words) word_idx['<unk>'] = len(words)
return word_idx return word_idx
...@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type): ...@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type):
l = ['<s>'] + l.strip().split() + ['<e>'] l = ['<s>'] + l.strip().split() + ['<e>']
if len(l) >= n: if len(l) >= n:
l = [word_idx.get(w, UNK) for w in l] l = [word_idx.get(w, UNK) for w in l]
for i in range(n, len(l) + 1): for i in six.moves.range(n, len(l) + 1):
yield tuple(l[i - n:i]) yield tuple(l[i - n:i])
elif DataType.SEQ == data_type: elif DataType.SEQ == data_type:
l = l.strip().split() l = l.strip().split()
......
...@@ -17,10 +17,15 @@ MNIST dataset. ...@@ -17,10 +17,15 @@ MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
from __future__ import print_function
import paddle.dataset.common import paddle.dataset.common
import subprocess import subprocess
import numpy import numpy
import platform import platform
import tempfile
from six.moves import range
__all__ = ['train', 'test', 'convert'] __all__ = ['train', 'test', 'convert']
URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
...@@ -45,23 +50,28 @@ def reader_creator(image_filename, label_filename, buffer_size): ...@@ -45,23 +50,28 @@ def reader_creator(image_filename, label_filename, buffer_size):
# According to http://stackoverflow.com/a/38061619/724872, we # According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here. # cannot use standard package gzip here.
m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
m.stdout.read(16) # skip some magic bytes m = subprocess.Popen(
[zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
tmp_image_file.seek(16) # skip some magic bytes
l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) # Python3 will not take stdout as file
l.stdout.read(8) # skip some magic bytes tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
l = subprocess.Popen(
[zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
tmp_label_file.seek(8) # skip some magic bytes
try: # reader could be break. try: # reader could be break.
while True: while True:
labels = numpy.fromfile( labels = numpy.fromfile(
l.stdout, 'ubyte', count=buffer_size).astype("int") tmp_label_file, 'ubyte', count=buffer_size).astype("int")
if labels.size != buffer_size: if labels.size != buffer_size:
break # numpy.fromfile returns empty slice after EOF. break # numpy.fromfile returns empty slice after EOF.
images = numpy.fromfile( images = numpy.fromfile(
m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( tmp_image_file, 'ubyte', count=buffer_size * 28 *
(buffer_size, 28 * 28)).astype('float32') 28).reshape((buffer_size, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0 images = images / 255.0 * 2.0 - 1.0
......
...@@ -22,11 +22,15 @@ set and test set into paddle reader creators. ...@@ -22,11 +22,15 @@ set and test set into paddle reader creators.
""" """
from __future__ import print_function
import zipfile import zipfile
import paddle.dataset.common import paddle.dataset.common
import re import re
import random import random
import functools import functools
import six
import paddle.compat as cpt
__all__ = [ __all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
...@@ -112,6 +116,7 @@ def __initialize_meta_info__(): ...@@ -112,6 +116,7 @@ def __initialize_meta_info__():
categories_set = set() categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file: with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file): for i, line in enumerate(movie_file):
line = cpt.to_text(line, encoding='latin')
movie_id, title, categories = line.strip().split('::') movie_id, title, categories = line.strip().split('::')
categories = categories.split('|') categories = categories.split('|')
for c in categories: for c in categories:
...@@ -136,6 +141,7 @@ def __initialize_meta_info__(): ...@@ -136,6 +141,7 @@ def __initialize_meta_info__():
USER_INFO = dict() USER_INFO = dict()
with package.open('ml-1m/users.dat') as user_file: with package.open('ml-1m/users.dat') as user_file:
for line in user_file: for line in user_file:
line = cpt.to_text(line, encoding='latin')
uid, gender, age, job, _ = line.strip().split("::") uid, gender, age, job, _ = line.strip().split("::")
USER_INFO[int(uid)] = UserInfo( USER_INFO[int(uid)] = UserInfo(
index=uid, gender=gender, age=age, job_id=job) index=uid, gender=gender, age=age, job_id=job)
...@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): ...@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
with zipfile.ZipFile(file=fn) as package: with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating: with package.open('ml-1m/ratings.dat') as rating:
for line in rating: for line in rating:
line = cpt.to_text(line, encoding='latin')
if (rand.random() < test_ratio) == is_test: if (rand.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::") uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid) uid = int(uid)
...@@ -187,7 +194,7 @@ def max_movie_id(): ...@@ -187,7 +194,7 @@ def max_movie_id():
Get the maximum value of movie id. Get the maximum value of movie id.
""" """
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_index_info__, list(MOVIE_INFO.values())).index return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
def max_user_id(): def max_user_id():
...@@ -195,7 +202,7 @@ def max_user_id(): ...@@ -195,7 +202,7 @@ def max_user_id():
Get the maximum value of user id. Get the maximum value of user id.
""" """
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_index_info__, list(USER_INFO.values())).index return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
def __max_job_id_impl__(a, b): def __max_job_id_impl__(a, b):
...@@ -210,7 +217,8 @@ def max_job_id(): ...@@ -210,7 +217,8 @@ def max_job_id():
Get the maximum value of job id. Get the maximum value of job id.
""" """
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id return six.moves.reduce(__max_job_id_impl__,
list(USER_INFO.values())).job_id
def movie_categories(): def movie_categories():
......
...@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20 ...@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
""" """
from __future__ import print_function
import os import os
import functools import functools
import rarfile import rarfile
......
...@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK ...@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK
TODO(yuyang18): Complete dataset. TODO(yuyang18): Complete dataset.
""" """
from __future__ import print_function
import six
import collections import collections
from itertools import chain from itertools import chain
...@@ -64,7 +67,7 @@ def get_word_dict(): ...@@ -64,7 +67,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category): for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field): for words in movie_reviews.words(field):
word_freq_dict[words] += 1 word_freq_dict[words] += 1
words_sort_list = list(word_freq_dict.items()) words_sort_list = six.iteritems(word_freq_dict)
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list): for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index)) words_freq_sorted.append((word[0], index))
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.cifar import paddle.dataset.cifar
import unittest import unittest
......
...@@ -12,10 +12,13 @@ ...@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.common import paddle.dataset.common
import unittest import unittest
import tempfile import tempfile
import glob import glob
from six.moves import range
class TestCommon(unittest.TestCase): class TestCommon(unittest.TestCase):
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.flowers import paddle.dataset.flowers
import unittest import unittest
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.imdb import paddle.dataset.imdb
import unittest import unittest
import re import re
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.imikolov import paddle.dataset.imikolov
import unittest import unittest
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.mnist import paddle.dataset.mnist
import unittest import unittest
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.mq2007 import paddle.dataset.mq2007
import unittest import unittest
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import unittest import unittest
import nltk import nltk
import paddle.dataset.sentiment as st import paddle.dataset.sentiment as st
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.voc2012 import paddle.dataset.voc2012
import unittest import unittest
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import paddle.dataset.wmt16 import paddle.dataset.wmt16
import unittest import unittest
......
...@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and ...@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
import os from __future__ import print_function
import numpy as np import numpy as np
import six
import tempfile import tempfile
import tarfile import tarfile
import os import os
...@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8): ...@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8):
return return
data = np.fromfile(filename, sep=' ') data = np.fromfile(filename, sep=' ')
data = data.reshape(data.shape[0] / feature_num, feature_num) data = data.reshape(data.shape[0] // feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0] axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1]) feature_range(maximums[:-1], minimums[:-1])
for i in range(feature_num - 1): for i in six.moves.range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio) offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset] UCI_TRAIN_DATA = data[:offset]
......
...@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images ...@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images
with segmentation has been increased from 7,062 to 9,993. with segmentation has been increased from 7,062 to 9,993.
""" """
from __future__ import print_function
import tarfile import tarfile
import io import io
import numpy as np import numpy as np
......
...@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and ...@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
from __future__ import print_function
import six
import tarfile import tarfile
import gzip import gzip
import paddle.dataset.common import paddle.dataset.common
import paddle.compat as cpt
__all__ = [ __all__ = [
'train', 'train',
...@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size): ...@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size):
out_dict = dict() out_dict = dict()
for line_count, line in enumerate(fd): for line_count, line in enumerate(fd):
if line_count < size: if line_count < size:
out_dict[line.strip()] = line_count out_dict[cpt.to_text(line.strip())] = line_count
else: else:
break break
return out_dict return out_dict
...@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size): ...@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size):
] ]
for name in names: for name in names:
for line in f.extractfile(name): for line in f.extractfile(name):
line_split = line.strip().split('\t') line_split = line.strip().split(six.b('\t'))
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_seq = line_split[0] # one source sequence src_seq = line_split[0] # one source sequence
...@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True): ...@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True):
tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size) src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse: if reverse:
src_dict = {v: k for k, v in list(src_dict.items())} src_dict = {v: k for k, v in six.iteritems(src_dict)}
trg_dict = {v: k for k, v in list(trg_dict.items())} trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
return src_dict, trg_dict return src_dict, trg_dict
......
...@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions. ...@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions.
} }
""" """
from __future__ import print_function
import os import os
import six
import tarfile import tarfile
import gzip import gzip
from collections import defaultdict from collections import defaultdict
import paddle.dataset.common import paddle.dataset.common
import paddle.compat as cpt
__all__ = [ __all__ = [
"train", "train",
...@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int) word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"): for line in f.extractfile("wmt16/train"):
line_split = line.strip().split("\t") line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2: continue if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1] sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split(): for w in sen.split():
...@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate( for idx, word in enumerate(
sorted( sorted(
iter(list(word_dict.items())), six.iteritems(word_dict), key=lambda x: x[1],
key=lambda x: x[1],
reverse=True)): reverse=True)):
if idx + 3 == dict_size: break if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0])) fout.write("%s\n" % (word[0]))
...@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False): ...@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
dict_path = os.path.join(paddle.dataset.common.DATA_HOME, dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size)) "wmt16/%s_%d.dict" % (lang, dict_size))
if not os.path.exists(dict_path) or ( if not os.path.exists(dict_path) or (
len(open(dict_path, "r").readlines()) != dict_size): len(open(dict_path, "rb").readlines()) != dict_size):
__build_dict(tar_file, dict_size, dict_path, lang) __build_dict(tar_file, dict_size, dict_path, lang)
word_dict = {} word_dict = {}
with open(dict_path, "r") as fdict: with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict): for idx, line in enumerate(fdict):
if reverse: if reverse:
word_dict[idx] = line.strip() word_dict[idx] = cpt.to_text(line.strip())
else: else:
word_dict[line.strip()] = idx word_dict[cpt.to_text(line.strip())] = idx
return word_dict return word_dict
...@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): ...@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f: with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name): for line in f.extractfile(file_name):
line_split = line.strip().split("\t") line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2: if len(line_split) != 2:
continue continue
src_words = line_split[src_col].split() src_words = line_split[src_col].split()
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import numpy as np import numpy as np
import warnings import warnings
""" """
......
...@@ -12,11 +12,14 @@ ...@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from paddle.fluid import framework as framework from paddle.fluid import framework as framework
from . import core from . import core
import collections import collections
import copy import copy
import six import six
from .. import compat as cpt
from . import unique_name from . import unique_name
__all__ = ['append_backward'] __all__ = ['append_backward']
...@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): ...@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
""" """
op_desc = core.OpDesc() op_desc = core.OpDesc()
op_desc.set_type(op_type) op_desc.set_type(op_type)
for para, args in list(inputs.items()): for para, args in six.iteritems(inputs):
op_desc.set_input( op_desc.set_input(
para, para,
list( list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg, map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args))) args)))
for para, args in list(outputs.items()): for para, args in six.iteritems(outputs):
op_desc.set_output( op_desc.set_output(
para, para,
list( list(
...@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): ...@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
if op_role_attr_name not in attrs: if op_role_attr_name not in attrs:
attrs[ attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in list(attrs.items()): for name, val in six.iteritems(attrs):
if isinstance(val, framework.Block): if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc) op_desc.set_block_attr(name, val.desc)
else: else:
...@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block): ...@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block):
""" """
Infer the data type of given grad variable Infer the data type of given grad variable
""" """
grad_var = block.desc.find_var(grad_var_name.encode("ascii")) grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii")) fwd_name = _strip_grad_suffix_(grad_var_name)
if block.desc.has_var_recursive(fwd_name): if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii")) fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
grad_var.set_dtype(fwd_var.dtype()) grad_var.set_dtype(fwd_var.dtype())
else: else:
grad_var.set_dtype(core.VarDesc.VarType.FP32) grad_var.set_dtype(core.VarDesc.VarType.FP32)
...@@ -102,8 +105,10 @@ def _some_in_set_(cands, s): ...@@ -102,8 +105,10 @@ def _some_in_set_(cands, s):
""" """
if len(cands) == 0: if len(cands) == 0:
return False return False
for c in cands: literal_set = cpt.to_text(s)
if c in s: literal_cands = cpt.to_text(cands)
for c in literal_cands:
if c in literal_set:
return True return True
return False return False
...@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name): ...@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y y@GRAD@RENAME@1 ==> y
""" """
if isinstance(name, six.text_type): name = cpt.to_text(name)
name = name.encode() pos = name.find(core.grad_var_suffix())
pos = name.find(six.b(core.grad_var_suffix()))
return name[:pos] if pos != -1 else name return name[:pos] if pos != -1 else name
...@@ -125,9 +129,7 @@ def _append_grad_suffix_(name): ...@@ -125,9 +129,7 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name Append grad suffix to the given variable name
e.g. x ==> x@GRAD e.g. x ==> x@GRAD
""" """
if isinstance(name, six.text_type): return cpt.to_text(name) + core.grad_var_suffix()
name = name.encode()
return name + six.b(core.grad_var_suffix())
def _addup_repetitive_outputs_(op_descs): def _addup_repetitive_outputs_(op_descs):
...@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs):
op_desc.set_output(param_name, arg_names) op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name) renamed_vars[var_name].append(new_name)
for var_name, inputs in list(renamed_vars.items()): for var_name, inputs in six.iteritems(renamed_vars):
if len(inputs) > 1: if len(inputs) > 1:
pending_sum_ops.append( pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
...@@ -243,7 +245,7 @@ from .proto import framework_pb2 ...@@ -243,7 +245,7 @@ from .proto import framework_pb2
def serialize_op_decs(op_desc): def serialize_op_decs(op_desc):
protostr = op_desc.serialize_to_string() protostr = op_desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr)) proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
return proto.__str__() return proto.__str__()
...@@ -364,7 +366,7 @@ def _append_backward_ops_(block, ...@@ -364,7 +366,7 @@ def _append_backward_ops_(block,
# Getting op's corresponding grad_op # Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc( grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list) op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
grad_op_descs.extend(grad_op_desc) grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var) grad_to_var.update(op_grad_to_var)
...@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
for grad_var_name in op_desc.output_arg_names(): for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii") if block.desc.has_var_recursive(cpt.to_bytes(
if block.desc.has_var_recursive( grad_var_name)) or grad_var_name == core.empty_var_name():
grad_var_name) or grad_var_name == core.empty_var_name():
continue continue
block.desc.var(grad_var_name) block.desc.var(cpt.to_bytes(grad_var_name))
new_vars.add(grad_var_name) new_vars.add(grad_var_name)
if grad_var_name not in grad_to_var: if grad_var_name not in grad_to_var:
continue continue
...@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): ...@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc.rename_output(name, new_name) op_desc.rename_output(name, new_name)
var_map[name] = new_name var_map[name] = new_name
for g, ng in list(var_map.items()): for g, ng in six.iteritems(var_map):
if g in grad_to_var: if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g] grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g) grad_to_var.pop(g)
...@@ -595,11 +596,12 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -595,11 +596,12 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
parameters = parameter_list parameters = parameter_list
else: else:
params = program.global_block().all_parameters() params = program.global_block().all_parameters()
program.global_block().iter_parameters()
parameters = [param.name for param in params] parameters = [param.name for param in params]
params_and_grads = [] params_and_grads = []
for param in parameters: for param in parameters:
if param not in grad_info_map: if cpt.to_text(param) not in grad_info_map:
continue continue
grad_info = grad_info_map[param] grad_info = grad_info_map[param]
grad_block = grad_info[1] grad_block = grad_info[1]
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import copy import copy
import six import six
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from .layers.control_flow import BlockGuard, equal from .layers.control_flow import BlockGuard, equal
from .framework import Operator from .framework import Operator
from .layer_helper import LayerHelper, unique_name from .layer_helper import LayerHelper, unique_name
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from . import decoder from . import decoder
from .decoder import * from .decoder import *
from . import memory_usage_calc from . import memory_usage_calc
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from . import beam_search_decoder from . import beam_search_decoder
from .beam_search_decoder import * from .beam_search_decoder import *
......
...@@ -20,6 +20,8 @@ without using the low level API such as while ops. ...@@ -20,6 +20,8 @@ without using the low level API such as while ops.
This API is still under active development and may change drastically. This API is still under active development and may change drastically.
""" """
from __future__ import print_function
import contextlib import contextlib
import numpy as np import numpy as np
import six import six
......
...@@ -20,6 +20,10 @@ batch size to fully utilize a GPU. ...@@ -20,6 +20,10 @@ batch size to fully utilize a GPU.
This API is still under active development and may change drastically. This API is still under active development and may change drastically.
""" """
from __future__ import print_function
import six
from .. import core from .. import core
from ..framework import Program, Variable from ..framework import Program, Variable
...@@ -72,7 +76,7 @@ def memory_usage(program, batch_size): ...@@ -72,7 +76,7 @@ def memory_usage(program, batch_size):
# Get the var_name list of first block and calculate # Get the var_name list of first block and calculate
total_memory = 0.0 total_memory = 0.0
for var in program.global_block().vars.itervalues(): for var in six.itervalues(program.global_block().vars):
data_count = 1 data_count = 1
for x in var.shape: for x in var.shape:
if x == -1: if x == -1:
...@@ -81,10 +85,10 @@ def memory_usage(program, batch_size): ...@@ -81,10 +85,10 @@ def memory_usage(program, batch_size):
data_count *= x data_count *= x
var_memory = data_count * dtype_to_size[var.dtype] var_memory = data_count * dtype_to_size[var.dtype]
if DEBUG: if DEBUG:
print "%s memory usage: %d" % (var.name, var_memory) print("%s memory usage: %d" % (var.name, var_memory))
total_memory += var_memory total_memory += var_memory
if DEBUG: if DEBUG:
print "total memory usage: %.2f" % (total_memory) print("total memory usage: %.2f" % (total_memory))
# Convert appropriate unit # Convert appropriate unit
unit_str = "B" unit_str = "B"
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from . import core from . import core
import numpy import numpy
import os import os
......
...@@ -12,7 +12,10 @@ ...@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import sys import sys
import six
import re import re
from .graphviz import GraphPreviewGenerator from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2 from .proto import framework_pb2
...@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
graph = GraphPreviewGenerator("some graph") graph = GraphPreviewGenerator("some graph")
# collect parameters and args # collect parameters and args
protostr = block.desc.serialize_to_string() protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr)) desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
def need_highlight(name): def need_highlight(name):
if highlights is None: return False if highlights is None: return False
......
...@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be ...@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be
invoked in a new local scope. invoked in a new local scope.
""" """
from __future__ import print_function
import paddle.fluid.core import paddle.fluid.core
import threading import threading
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册