Commit b0904794 authored by: T tensor-tang

Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

......@@ -35,8 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6"
DEPENDS ${MKLML_PROJECT}
# Anakin code errors on Intel(R) Xeon(R) Gold 5117 CPUs; temporarily do not compile avx512-related code.
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
......
......@@ -115,6 +115,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(rw_lock_test SRCS rw_lock_test.cc)
# disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <string>
#include <vector>
......
......@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stack>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
......
......@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
}
void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
// NOTICE(minqiyang): pybind11 passes an empty list in Python as
// the std::vector<int> type in C++, so we have to correct the attr's type
// here when we hit this case
proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
if (attr_type == proto::AttrType::INTS &&
boost::get<std::vector<int>>(v).size() == 0u) {
// Find current attr via attr name and set the correct attribute value
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
default:
PADDLE_THROW("Wrong attr type %d", attr.type());
}
need_update_ = true;
return;
}
this->attrs_[name] = v;
need_update_ = true;
}
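For reference, a minimal standalone sketch of the `v.which() - 1` trick above: boost::variant::which() returns the zero-based index of the alternative currently held, and the Attribute variant lists its alternatives (after a boost::blank placeholder) in the same order as proto::AttrType. The variant below is hypothetical and much shorter than the real Attribute typedef in framework/type_defs.h.

#include <boost/blank.hpp>
#include <boost/variant.hpp>
#include <cassert>
#include <string>
#include <vector>

int main() {
  // Hypothetical, abbreviated stand-in for the real Attribute variant.
  using Attr = boost::variant<boost::blank, int, float, std::string,
                              std::vector<int>>;
  Attr a = std::vector<int>{};  // an empty Python list arrives as this type
  assert(a.which() == 4);       // zero-based index of the held alternative
  // which() - 1 == 3, matching proto::AttrType::INTS in the real enum;
  // this is why an empty list always reports INTS and needs the correction
  // applied in SetAttr above.
  return 0;
}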
......@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second;
}
const proto::OpProto::Attr &OpDesc::GetProtoAttr(
const std::string &name) const {
const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return attr;
}
}
PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
}
Attribute OpDesc::GetNullableAttr(const std::string &name) const {
auto it = attrs_.find(name);
if (it != attrs_.end()) {
......
......@@ -81,6 +81,8 @@ class OpDesc {
Attribute GetAttr(const std::string &name) const;
const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <pthread.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
struct RWLock {
RWLock() { pthread_rwlock_init(&lock_, nullptr); }
~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
}
void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
}
void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
}
private:
pthread_rwlock_t lock_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h"
#include <gtest/gtest.h>
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include <vector>
namespace f = paddle::framework;
void f1(f::RWLock *lock) {
lock->RDLock();
lock->UNLock();
}
TEST(RWLOCK, read_read) {
f::RWLock lock;
lock.RDLock();
std::thread t1(f1, &lock);
std::thread t2(f1, &lock);
t1.join();
t2.join();
lock.UNLock();
}
void f2(f::RWLock *lock, std::vector<int> *result) {
lock->RDLock();
ASSERT_EQ(result->size(), 0UL);
lock->UNLock();
}
void f3(f::RWLock *lock, std::vector<int> *result) {
lock->WRLock();
result->push_back(1);
lock->UNLock();
}
TEST(RWLOCK, read_write) {
f::RWLock lock;
std::vector<int> result;
lock.RDLock();
std::thread t1(f2, &lock, &result);
t1.join();
std::thread t2(f3, &lock, &result);
std::this_thread::sleep_for(std::chrono::seconds(1));
ASSERT_EQ(result.size(), 0UL);
lock.UNLock();
t2.join();
ASSERT_EQ(result.size(), 1UL);
}
void f4(f::RWLock *lock, std::vector<int> *result) {
lock->RDLock();
ASSERT_EQ(result->size(), 1UL);
lock->UNLock();
}
TEST(RWLOCK, write_read) {
f::RWLock lock;
std::vector<int> result;
lock.WRLock();
std::thread t1(f4, &lock, &result);
std::this_thread::sleep_for(std::chrono::seconds(1));
result.push_back(1);
lock.UNLock();
t1.join();
}
......@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
: true;
}
std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
const std::vector<int64_t>& keys, framework::Tensor* value) const {
int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
rwlock_->RDLock();
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %d not found", key);
}
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %d should have the same size with rows_ %d",
map_size, vector_size);
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
size_t row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
}
// core logic to insert a new key into id_to_index_
rows_.push_back(key);
auto index = static_cast<int64_t>(rows_.size() - 1);
id_to_index_[key] = index;
rwlock_->UNLock();
return index;
} else {
auto index = write_iter->second;
rwlock_->UNLock();
return index;
}
} else {
auto index = iter->second;
rwlock_->UNLock();
return index;
}
}
void SelectedRows::SyncIndex() {
rwlock_->WRLock();
id_to_index_.clear();
for (size_t i = 0; i < rows_.size(); ++i) {
id_to_index_[rows_[i]] = i;
}
rwlock_->UNLock();
}
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown) {
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
if (keys.empty()) {
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
for (size_t i = 0; i < keys.size(); ++i) {
int64_t index = Index(keys[i]);
if (index == -1) {
non_keys_pair.push_back(
std::make_pair(keys[i], static_cast<int64_t>(i)));
} else {
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
for (size_t i = 0; i < ids.numel(); ++i) {
int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
}
return non_keys_pair;
}
bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
if (value_->IsInitialized()) {
PADDLE_ENFORCE_EQ(
value.type(), value_->type(),
"The type of the value should be same with the original value");
}
PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
"The first dim of value should be 1.");
std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
auto index = Index(key);
bool is_new_key = false;
if (index == -1) {
rows_.push_back(key);
index = rows_.size() - 1;
is_new_key = true;
// whether need to resize the table
if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
auto dims = value_->dims();
dims[0] = (dims[0] + 1) << 1;
framework::VisitDataType(framework::ToDataType(value.type()),
ReAllocateVisitor(dims, value_.get()));
}
}
framework::VisitDataType(
framework::ToDataType(value.type()),
TensorCopyVisitor(value_.get(),
index * value_->numel() / value_->dims()[0], value,
static_cast<int64_t>(0), value.numel()));
return is_new_key;
}
} // namespace framework
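To summarize the new interface, a minimal usage sketch with hypothetical keys and shapes (the tests further below exercise the full behavior):

// Sketch only: SelectedRows as an auto-growing key -> row-index table.
paddle::framework::SelectedRows table;
table.mutable_value()->Resize(paddle::framework::make_ddim({4, 8}));
table.mutable_value()->mutable_data<float>(paddle::platform::CPUPlace());

int64_t i0 = table.AutoGrownIndex(/*key=*/42, /*auto_grown=*/true);  // inserts key 42 at row 0
int64_t i1 = table.AutoGrownIndex(42, /*auto_grown=*/false);         // found, returns 0 again
// table.AutoGrownIndex(7, false) would PADDLE_THROW: key 7 not found.
table.SyncIndex();  // rebuilds id_to_index_ from rows_ after external edits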
......
......@@ -17,10 +17,12 @@ limitations under the License. */
#include <algorithm>
#include <memory>
#include <mutex> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -48,13 +50,13 @@ class SelectedRows {
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
SelectedRows() {
height_ = 0;
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
platform::Place place() const { return value_->place(); }
......@@ -74,47 +76,51 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/*
* @brief whether has the specified key in the table.
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %s not in table", key);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
/*
* @brief whether has the specified key in the table.
*
* @return true if the key exists.
*/
bool HasKey(int64_t key) const;
/*
* @brief Get value by the key list, if the
* @brief Get value by the key list.
* Note!!! this interface is only used when SelectedRows is used as a
* parameter for the distributed lookup table.
*
* @return a list of pairs containing the non-existent keys and their
* indices in the value
*/
std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
framework::Tensor* value) const;
void Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown = false);
/*
* @brief Set a key-value pair into the table.
* This function will double the value memory if it's not enough.
* @brief Get the index of the key from the id_to_index_ map. If the key
* does not exist, add it into id_to_index_.
*
* @note:
* 1. The first dim of the value should be 1
* 2. The value should be initialized and the data type
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
* Note!!! this interface is only used when SelectedRows is used as a
* parameter for the distributed lookup table.
*
* @return index of the key.
*/
bool Set(int64_t key, const Tensor& value);
int64_t AutoGrownIndex(int64_t key, bool auto_grown);
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
void SyncIndex();
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
......@@ -127,9 +133,10 @@ class SelectedRows {
// SelectedRows are simply concatenated when added together; duplicate
// rows are only handled when a SelectedRows is added to a Tensor.
Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
std::unique_ptr<RWLock> rwlock_{nullptr};
};
/*
......
......@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include <time.h>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
......@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
}
TEST_F(SelectedRowsTester, SparseTable) {
TEST(SelectedRows, SparseTable) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(framework::make_ddim({1, 100}));
table.mutable_value()->mutable_data<float>(cpu);
table.mutable_rows()->push_back(1);
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6));
ASSERT_EQ(table.rows().size(), static_cast<size_t>(3));
framework::Tensor ids;
ids.Resize(framework::make_ddim({4}));
auto* ids_data = ids.mutable_data<int64_t>(cpu);
ids_data[0] = static_cast<int64_t>(6);
ids_data[1] = static_cast<int64_t>(6);
ids_data[2] = static_cast<int64_t>(8);
ids_data[3] = static_cast<int64_t>(10);
int64_t key = 10000;
int64_t non_key = 999;
framework::Tensor value;
value.Resize(framework::make_ddim({1, 100}));
auto ptr = value.mutable_data<float>(cpu);
ptr[0] = static_cast<float>(10);
framework::Tensor get_value;
auto* value_data = get_value.mutable_data<float>(
framework::make_ddim({4, embedding_width}), cpu);
table.Get(ids, &get_value);
ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
ASSERT_EQ(table.HasKey(key), false);
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[0 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[1 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[2 * embedding_width + j], 1);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[3 * embedding_width + j], 0);
}
}
table.Set(key, value);
void f1(SelectedRows* table, int table_size) {
for (int i = 1000000; i > 0; --i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
ASSERT_EQ(table.HasKey(key), true);
// check re-allocate
ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
void f2(SelectedRows* table, int table_size) {
for (int i = 0; i < 1000000; ++i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
framework::Tensor get_value;
get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
std::vector<int64_t> keys({non_key, key});
auto non_key_pairs = table.Get(keys, &get_value);
void f3(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 100000; i > 0; --i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f3 run time:" << t2 - t1 << std::endl;
}
void f4(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 0; i < 100000; ++i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f4 run time:" << t2 - t1 << std::endl;
}
TEST(SelectedRows, MultiThreadAutoIndex) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100000;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
ASSERT_EQ(non_key_pairs[0].first, non_key);
std::thread t1(f1, &table, table_size);
std::thread t11(f1, &table, table_size);
std::thread t2(f2, &table, table_size);
std::thread t22(f2, &table, table_size);
t1.join();
t11.join();
t2.join();
t22.join();
std::thread t3(f3, &table, table_size);
std::thread t4(f4, &table, table_size);
t3.join();
t4.join();
}
} // namespace framework
......
......@@ -8,7 +8,7 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
helper.cc
model_store_pass.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
......
......@@ -20,17 +20,6 @@ namespace paddle {
namespace inference {
namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_index_ = std::type_index(typeid(std::string));
}
PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"),
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/device.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace inference {
......@@ -38,39 +39,35 @@ class NodeMap;
// A helper class to maintain the status from Pass.
struct NodeAttr {
using any_t =
boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
// NOTE T should be a primitive type or a struct composed of several
// primitive types.
// NOTE STL containers should not be used here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
std::string &String();
std::string &String() { return As<std::string>(); }
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_index_ = std::type_index(typeid(T));
data_.resize(sizeof(T));
if (type_index_ == typeid(NodeAttr)) {
type_index_ = typeid(T);
any_data_ = T();
} else {
PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
}
PADDLE_ENFORCE(framework::IsType<T>(type_index_),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_index_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
return boost::get<T>(any_data_);
}
private:
std::string data_;
any_t any_data_;
std::type_index type_index_{typeid(NodeAttr)};
};
......
......@@ -20,6 +20,24 @@ namespace paddle {
namespace inference {
namespace analysis {
TEST(NodeAttr, bool) {
NodeAttr x;
x.Bool() = true;
ASSERT_EQ(x.Bool(), true);
}
TEST(NodeAttr, int32) {
NodeAttr x;
x.Int32() = 32;
ASSERT_EQ(x.Int32(), 32);
}
TEST(NodeAttr, string) {
NodeAttr x;
x.String() = "Hello";
ASSERT_EQ(x.String(), "Hello");
}
TEST(Node, Attr) {
// Node is an abstract class; use Value instead, since they share the same
// Attr logic.
......@@ -27,6 +45,9 @@ TEST(Node, Attr) {
auto* node = nodes.Create(Node::Type::kValue);
node->attr("v0").Int32() = 2008;
ASSERT_EQ(node->attr("v0").Int32(), 2008);
node->attr("str").String() = "hello world";
ASSERT_EQ(node->attr("str").String(), "hello world");
}
} // namespace analysis
......
......@@ -13,16 +13,22 @@ else
use_gpu_list='false'
fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
# download vis_demo data
function download() {
dir_name=$1
mkdir -p $dir_name
cd $dir_name
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
echo "${PREFIX}{dir_name}.tar.gz has been downloaded."
else
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
fi
cd ..
}
URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
mkdir -p data
cd data
vis_demo_list='se_resnext50 ocr mobilenet'
......
......@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
......@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = static_cast<T>(0.5) * dout / out;
const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
}
};
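For reference, the rule this implements: out = √x, so ∂out/∂x = 1/(2·out), and Eigen::numext::conj is a no-op for real scalar types:

\frac{\partial L}{\partial x}
  = \frac{\partial L}{\partial \mathrm{out}} \cdot \frac{1}{2\,\overline{\mathrm{out}}}
  = \tfrac{1}{2}\,\mathrm{dout}/\overline{\mathrm{out}}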
......@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor) - static_cast<T>(1));
x.pow(static_cast<T>(factor - static_cast<T>(1)));
}
};
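In both variants the quantity computed is the power rule:

\frac{\partial}{\partial x}\,x^{f} = f\,x^{f-1},
\qquad
dx = \mathrm{dout}\cdot f\cdot x^{f-1}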
......@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
T b = static_cast<T>(beta);
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
dx.device(d) = dout * ((b * out) + temp2);
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
}
};
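For the record, the identity the functor relies on, with σ the logistic sigmoid and out = x·σ(βx):

\frac{d}{dx}\,x\,\sigma(\beta x)
  = \sigma(\beta x) + \beta x\,\sigma(\beta x)\bigl(1 - \sigma(\beta x)\bigr)
  = \beta\,\mathrm{out} + \sigma(\beta x)\bigl(1 - \beta\,\mathrm{out}\bigr)

where σ(βx) is temp1 and the trailing product is temp2 in the code above.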
......
......@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>,
ops::AssignValueKernel<plat::float16>);
ops::AssignValueKernel<float>);
......@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
template <typename T, typename DeviceContext>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool EnableFp16(const DeviceContext& dev_ctx,
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor cores are supported since the Volta GPUs and
// are only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
return true;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
return false;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
......@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor cores are supported since the Volta GPUs and
// are only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
......@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
......@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
paddle::operators::CUDNNConvGradOpKernel<double>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
......@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
paddle::operators::CUDNNConvGradOpKernel<double>);
......@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>,
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
ops::CrossEntropyOpKernel<CUDACtx, double>);
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
......@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
auto ptr = w_value->mutable_data<float>(*place);
......
......@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
......@@ -14,24 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
int64_t>);
......@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
T val(0);
T val = 0;
do {
int x_offset = i * w + j;
......@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
T val(0);
T val = 0;
int ttid = tid;
while (true) {
......
......@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
class FillConstantOp : public framework::OperatorWithKernel {
class FillConstantInferShape : public framework::InferShapeBase {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null.");
auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out.mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value);
}
};
......@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto* out = ctx.Output<framework::Tensor>("Out");
out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out->mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
}
math::set_constant(ctx.template device_context<DeviceContext>(), out,
value);
}
};
} // namespace operators
} // namespace paddle
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
framework::VisitDataType(
dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out
platform::DeviceContextPool &pool =
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(gaussian_random,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
......@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto out_var = scope.FindVar(Output("Out"));
auto w_var = scope.FindVar(Input("W"));
auto ids_var = scope.FindVar(Input("Ids"));
unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
float min = Attr<float>("min");
float max = Attr<float>("max");
bool auto_grown_table = Attr<bool>("auto_grown_table");
PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
"The type of Out var should be LodTensor.");
......@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto &ids_t = ids_var->Get<framework::LoDTensor>();
auto out_t = out_var->GetMutable<framework::LoDTensor>();
auto w_t = w_var->GetMutable<framework::SelectedRows>();
std::vector<int64_t> keys;
keys.resize(ids_t.numel());
for (int64_t i = 0; i < ids_t.numel(); ++i) {
keys[i] = ids_t.data<int64_t>()[i];
}
// TODO(Yancey1989): support CUDA Place for the sparse table
platform::CPUPlace cpu;
auto out_shape = w_t->value().dims();
out_shape[0] = keys.size();
out_shape[0] = ids_t.numel();
out_t->Resize(out_shape);
out_t->mutable_data(cpu, w_t->value().type());
PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
framework::proto::VarType::FP32,
"The sparse table only support FP32");
auto non_keys_pair = w_t->Get(keys, out_t);
if (!auto_grown_table) {
PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
"there is some keys does exists in the sparse table.");
}
auto value_shape = w_t->value().dims();
value_shape[0] = 1;
for (const auto &it : non_keys_pair) {
const auto key = it.first;
const auto index = it.second;
framework::Tensor value;
value.Resize(value_shape);
auto data = value.mutable_data<float>(cpu);
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(min, max);
int64_t size = value.numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
w_t->Set(key, value);
memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
cpu, value.data<float>(), value.numel() * sizeof(float));
}
w_t->Get(ids_t, out_t, true);
}
};
......@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f);
AddAttr<float>("max",
"(float, default 1.0) "
"Maximum value of uniform random")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<bool>("auto_grown_table",
"(bool default false)"
"Whether create new value if for nonexistent key.")
......
......@@ -15,25 +15,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
HOSTDEVICE T log(const T& val) {
return std::log(val);
}
template <>
HOSTDEVICE platform::float16 log(const platform::float16& val) {
// strange bug: hlog does not exist.
return static_cast<float16>(0);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
......@@ -49,12 +35,12 @@ template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) {
int tid = threadIdx.x;
T val(0);
T val = 0;
int idx = blockIdx.x * class_num + tid;
int end = blockIdx.x * class_num + class_num;
for (; idx < end; idx += blockDim.x) {
val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
}
val = paddle::platform::reduceSum(val, tid, blockDim.x);
......@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
......@@ -35,21 +33,6 @@ struct TolerableValue {
}
};
// float16 value clipping behaves differently.
using paddle::platform::float16;
using paddle::platform::isfinite;
template <>
struct TolerableValue<float16> {
HOSTDEVICE float16 operator()(const float16& x) const {
if (isfinite(x))
return x;
else if (x > static_cast<float16>(0))
return std::numeric_limits<float16>::max();
else
return std::numeric_limits<float16>::min();
}
};
template <typename DeviceContext, typename T>
class CrossEntropyFunctor {
public:
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
namespace {
template <typename T, int block_size>
......@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto* out_data = output->data<T>();
SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));
functor(context, output, 0.0);
const int block_size = 256;
dim3 threads(block_size, 1);
......@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template <typename T>
struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
......@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext,
platform::float16>;
namespace {
template <typename T, int block_size>
......@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
namespace scatter {
......@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0));
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
......@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
......@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template class SoftmaxCUDNNFunctor<platform::float16>;
template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<platform::float16>;
template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
......
......@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG->mutable_data<T>(context.GetPlace());
T ig_size = static_cast<T>(IG->numel());
Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
Eigen::DSizes<int, 1> bcast(ig_size);
EigenVector<T>::Flatten(*IG).device(
*context.template device_context<DeviceContext>().eigen_device()) =
......
......@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>);
......@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>,
ops::PoolCUDNNGradOpKernel<plat::float16>);
ops::PoolCUDNNGradOpKernel<double>);
REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>,
......@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>,
ops::PoolCUDNNGradOpKernel<plat::float16>);
ops::PoolCUDNNGradOpKernel<double>);
......@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase {
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC(
Recv operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class SamplingIdOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SamplingIdOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SamplingIdOp should not be null.");
PADDLE_ENFORCE(
ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
"min must less then max");
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(input_dims.size() == 2,
"Input(X, Filter) should be 2-D tensor.");
framework::DDim dims = input_dims;
ctx->SetOutputDim("Out", dims);
ctx->ShareLoD("X", "Out");
}
};
class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
AddOutput("Out", "SamplingId data tensor.");
AddComment(R"DOC(
SamplingId Operator.
A layer for sampling an id from the multinomial distribution given by the
input. One id is sampled per input sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].")
.SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].")
.SetDefault(1.0f);
AddAttr<int>("seed",
"Random seed used for the random number engine. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. [default 0].")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
paddle::operators::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
paddle::operators::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <random>
#include <sstream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class SamplingIdKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("X");
const int batch_size = static_cast<int>(input->dims()[0]);
const int width = static_cast<int>(input->dims()[1]);
PADDLE_ENFORCE_GE(batch_size, 0,
"batch_size(dims[0]) must be nonnegative.");
PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");
std::vector<T> ins_vector;
framework::TensorToVector(*input, context.device_context(), &ins_vector);
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::uniform_real_distribution<T> dist(
static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.Attr<float>("max")));
std::vector<T> ids(batch_size);
for (size_t i = 0; i < batch_size; ++i) {
T r = dist(engine);
int idx = width - 1;
for (int j = 0; j < width; ++j) {
if ((r -= ins_vector[i * width + j]) < 0) {
idx = j;
break;
}
}
ids[i] = ins_vector[i * width + idx];
}
std::vector<int64_t> out_dim;
out_dim.push_back(static_cast<int64_t>(batch_size));
Tensor* output = context.Output<Tensor>("Out");
output->Resize(framework::make_ddim(out_dim));
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(ids, context.device_context(), output);
}
};
} // namespace operators
} // namespace paddle
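The sampling loop in SamplingIdKernel::Compute above is an inverse-CDF walk over each row: draw r uniformly, subtract the row entries until r goes negative, and take that column. A minimal NumPy sketch of the same logic, purely illustrative (the function name and the use of numpy.random.RandomState are assumptions, not part of Paddle's API):

    import numpy as np

    def sampling_id_reference(x, low=0.0, high=1.0, seed=0):
        # x: [batch_size, width] array of (unnormalized) probabilities.
        rng = np.random.RandomState(seed if seed != 0 else None)
        batch_size, width = x.shape
        out = np.empty(batch_size, dtype=x.dtype)
        for i in range(batch_size):
            r = rng.uniform(low, high)  # one draw per sample
            idx = width - 1             # fall back to the last column
            for j in range(width):
                r -= x[i, j]            # walk the cumulative mass
                if r < 0:
                    idx = j
                    break
            out[i] = x[i, idx]          # note: the kernel stores the value at idx
        return out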
......@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
scale,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
int64_t>);
......@@ -37,22 +37,19 @@ class SendBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
VLOG(3) << "SendBarrierOp sync";
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
if (sync_mode) {
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
};
......@@ -70,7 +67,6 @@ the Parameter Server would know all variables have been sent.
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
}
};
......
......@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
Send operator
......
......@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < grad.rows().size(); i++) {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height");
int64_t id_index = param.Index(grad.rows()[i]);
int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table");
for (int64_t j = 0; j < grad_row_width; j++) {
......
......@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops::SoftmaxCUDNNKernel<float>,
ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>,
ops::SoftmaxGradCUDNNKernel<plat::float16>);
ops::SoftmaxGradCUDNNKernel<float>);
......@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
......@@ -11,13 +11,10 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if (!in_place) {
math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out,
static_cast<T>(0));
0.0);
}
math::SelectedRowsAddToTensor<DeviceContext, T> functor;
......
......@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using paddle::platform::float16;
template <typename T>
struct Pair {
......@@ -35,11 +32,6 @@ struct Pair {
id = id;
}
__device__ __forceinline__ void clear() {
v = -INFINITY;
id = -1;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
......@@ -61,12 +53,6 @@ struct Pair {
int64_t id;
};
template <>
__device__ __forceinline__ void Pair<float16>::clear() {
v = platform::raw_uint16_to_float16(0x400);
id = -1;
}
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size) {
......@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].clear();
topk[k].set(-INFINITY, -1);
}
}
if (!(*is_empty)) {
......@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*max = topk[MaxLength - 1];
if ((*max).v == static_cast<T>(-1)) *is_empty = true;
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
......@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(std::numeric_limits<T>::min(), -1);
topk[k].set(-INFINITY, -1);
}
}
if (!(*is_empty)) {
......@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true;
for (int k = 0; k < MaxLength; k++) {
topk[k].clear();
topk[k].set(-INFINITY, -1);
}
while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
......@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>);
......@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
tensor = selected_rows->mutable_value();
tensor->Resize(framework::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]);
} else {
PADDLE_THROW(
"uniform_random_op's output only"
......
......@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
......@@ -40,11 +36,6 @@ struct UniformGenerator {
}
};
template <typename T, typename V>
struct CastFunctor {
HOSTDEVICE V operator()(const T& a) { return static_cast<V>(a); }
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
......@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T max = static_cast<T>(context.Attr<float>("max"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
int64_t size = tensor->numel();
if (out_var->IsType<framework::LoDTensor>() &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
framework::Tensor master_copy_tensor;
master_copy_tensor.Resize(tensor->dims());
float* master_copy_tensor_data =
master_copy_tensor.mutable_data<float>(context.GetPlace());
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<float>(master_copy_tensor_data),
UniformGenerator<float>(static_cast<float>(min),
static_cast<float>(max), seed));
platform::Transform<platform::CUDADeviceContext> trans;
auto* in_begin = master_copy_tensor.data<float>();
auto* in_end = in_begin + master_copy_tensor.numel();
auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
trans(context.template device_context<platform::CUDADeviceContext>(),
in_begin, in_end, out_begin, CastFunctor<float, T>());
} else {
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed));
}
if (VLOG_IS_ON(5)) {
framework::Tensor cpu_tensor;
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
auto& dev_ctx =
*platform::DeviceContextPool::Instance().Get(context.GetPlace());
dev_ctx.Wait();
auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
VLOG(5) << "The Uniform output " << x;
}
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>,
paddle::operators::GPUUniformRandomKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>,
paddle::operators::GPUUniformRandomKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(uniform_random,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
......@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
#if CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, delta, width);
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
#endif
}
......@@ -46,9 +46,16 @@ template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
half tmp = static_cast<half>(val);
__shfl_down(tmp, static_cast<unsigned>(delta), width);
return float16(tmp);
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
}
#else
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
return float16(__shfl_down_sync(mask, static_cast<half>(val),
static_cast<unsigned>(delta), width));
}
#endif
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <random>
......@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// NOTE(dzhwinter): the float16 add has a small truncation error.
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(out[i]),
static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
......@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
}
// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val) {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
}
return val;
}
template <typename T>
__forceinline__ __device__ T BlockReduce(T val) {
static __shared__ T shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = WarpReduceSum(val); // Each warp performs partial reduction
if (lane == 0) shared[wid] = val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
// read from shared memory only if that warp existed
val =
(threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp
return val;
}
template <typename T>
__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
T sum(0);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
sum += in[i];
}
sum = BlockReduce<T>(sum);
__syncthreads();
if (threadIdx.x == 0) out[blockIdx.x] = sum;
}
template <typename T>
void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
for (size_t i = 0; i < num; ++i) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
cudaFree(d_in1);
cudaFree(d_in2);
}
TEST(CudaShuffleSync, float16) {
TestReduce<float>(10);
TestReduce<float>(1000);
// float16 will overflow or accumulate truncation errors at larger sizes.
TestReduce<float16>(10);
TestReduce<float16>(100, /*atol error*/ 1.0);
}
......@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) {
void BindVarDsec(pybind11::module *m) {
pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
var_desc
.def("name",
[](pd::VarDesc &self) {
pybind11::bytes name = self.Name();
return name;
},
pybind11::return_value_policy::reference)
.def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
.def("set_name", &pd::VarDesc::SetName)
.def("set_shape", &pd::VarDesc::SetShape)
.def("set_shapes", &pd::VarDesc::SetShapes)
......
......@@ -54,6 +54,8 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "pybind11/stl.h"
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
......@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) {
self.set_rows(new_rows);
#endif
})
.def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
.def("rows", [](SelectedRows &self) {
auto rows = self.rows();
std::vector<int64_t> new_rows;
......
......@@ -54,7 +54,7 @@ function cpu_config() {
if [ $platform == "Linux" ]; then
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
elif [ $platform == "Darwin" ]; then
if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then
if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
# HT is OFF
ht=1
fi
......
......@@ -24,4 +24,5 @@ except ImportError:
import paddle.reader
import paddle.dataset
import paddle.batch
import paddle.compat
batch = batch.batch
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import math
__all__ = [
'long_type',
'to_text',
'to_bytes',
'round',
'floor_division',
'get_exception_message',
]
if six.PY2:
int_type = int
long_type = long
else:
int_type = int
long_type = int
# str and bytes related functions
def to_text(obj, encoding='utf-8', inplace=False):
"""
All strings in PaddlePaddle should be represented as literal strings.
This function converts the given object to a literal string without any encoding.
In particular, if the object is a list or set container, we iterate over
all items in the object and convert each of them to a literal string.
In Python3:
Decode the bytes type object to str type with specific encoding
In Python2:
Decode the str type object to unicode type with specific encoding
Args:
obj(unicode|str|bytes|list|set) : The object to be decoded.
encoding(str) : The encoding format to decode a string
inplace(bool) : Whether to modify the original object in place instead of creating a new one
Returns:
Decoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, list):
if inplace:
for i in six.moves.xrange(len(obj)):
obj[i] = _to_text(obj[i], encoding)
return obj
else:
return [_to_text(item, encoding) for item in obj]
elif isinstance(obj, set):
if inplace:
for item in obj:
obj.remove(item)
obj.add(_to_text(item, encoding))
return obj
else:
return set([_to_text(item, encoding) for item in obj])
else:
return _to_text(obj, encoding)
def _to_text(obj, encoding):
"""
In Python3:
Decode the bytes type object to str type with specific encoding
In Python2:
Decode the str type object to unicode type with specific encoding,
or we just return the unicode string of object
Args:
obj(unicode|str|bytes) : The object to be decoded.
encoding(str) : The encoding format
Returns:
decoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, six.binary_type):
return obj.decode(encoding)
elif isinstance(obj, six.text_type):
return obj
else:
return six.u(obj)
def to_bytes(obj, encoding='utf-8', inplace=False):
"""
All strings in PaddlePaddle should be represented as literal strings.
This function converts the given object to bytes with the specified encoding.
In particular, if the object is a list or set container, we iterate over
all items in the object and convert each of them to bytes.
In Python3:
Encode the str type object to bytes type with specific encoding
In Python2:
Encode the unicode type object to str type with specific encoding,
or we just return the 8-bit string of object
Args:
obj(unicode|str|bytes|list|set) : The object to be encoded.
encoding(str) : The encoding format to encode a string
inplace(bool) : Whether to modify the original object in place instead of creating a new one
Returns:
Encoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, list):
if inplace:
for i in six.moves.xrange(len(obj)):
obj[i] = _to_bytes(obj[i], encoding)
return obj
else:
return [_to_bytes(item, encoding) for item in obj]
elif isinstance(obj, set):
if inplace:
for item in obj:
obj.remove(item)
obj.add(_to_bytes(item, encoding))
return obj
else:
return set([_to_bytes(item, encoding) for item in obj])
else:
return _to_bytes(obj, encoding)
def _to_bytes(obj, encoding):
"""
In Python3:
Encode the str type object to bytes type with specific encoding
In Python2:
Encode the unicode type object to str type with specific encoding,
or we just return the 8-bit string of object
Args:
obj(unicode|str|bytes) : The object to be encoded.
encoding(str) : The encoding format
Returns:
encoded result of obj
"""
if obj is None:
return obj
assert encoding is not None
if isinstance(obj, six.text_type):
return obj.encode(encoding)
elif isinstance(obj, six.binary_type):
return obj
else:
return six.b(obj)
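A few quick checks of the expected round-trip behavior under Python 3 (a hedged sketch; it assumes the module is importable as paddle.compat):

    import paddle.compat as cpt

    assert cpt.to_text(b'hello') == 'hello'          # bytes -> str
    assert cpt.to_bytes('hello') == b'hello'         # str -> bytes
    assert cpt.to_text([b'a', b'b']) == ['a', 'b']   # list items converted one by one
    assert cpt.to_bytes({'a'}) == {b'a'}             # sets are rebuilt element-wise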
# math related functions
def round(x, d=0):
"""
Compatible round which behaves the same way in Python 2 and Python 3,
rounding halfway cases away from zero as Python 2 does.
Args:
x(float) : The number to round.
d(int) : The number of decimal places, defaulting to 0.
Returns:
round result of x
"""
if six.PY3:
# The official workaround for round in Python 3 is insufficient here;
# we implement it according to this answer: https://www.techforgeek.info/round_python.html
if x > 0.0:
p = 10**d
return float(math.floor((x * p) + math.copysign(0.5, x))) / p
elif x < 0.0:
p = 10**d
return float(math.ceil((x * p) + math.copysign(0.5, x))) / p
else:
return math.copysign(0.0, x)
else:
import __builtin__
return __builtin__.round(x, d)
def floor_division(x, y):
"""
Compatible division which behaves the same way in Python 2 and Python 3:
the result is an int value of floor(x / y), matching Python 2's integer
division and Python 3's // operator.
Args:
x(int|float) : The number to divide.
y(int|float) : The number to be divided
Returns:
division result of x // y
"""
return x // y
# exception related functions
def get_exception_message(exc):
"""
Get the error message of a specific exception
Args:
exc(Exception) : The exception to get the error message from.
Returns:
the error message of exc
"""
assert exc is not None
if six.PY2:
return exc.message
else:
return str(exc)
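For the math and exception helpers, a short illustrative check of the Python 2 style semantics they provide (again assuming the paddle.compat import path):

    from paddle.compat import round, floor_division, get_exception_message

    assert round(2.5) == 3.0          # Python 3's builtin round(2.5) would give 2
    assert round(-2.5) == -3.0        # halfway cases round away from zero
    assert floor_division(7, 2) == 3  # floor semantics on both interpreters

    try:
        raise ValueError('bad value')
    except ValueError as e:
        assert get_exception_message(e) == 'bad value'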
......@@ -28,11 +28,13 @@ images per class.
"""
from __future__ import print_function
import itertools
import numpy
import paddle.dataset.common
import tarfile
from six.moves import zip
import six
from six.moves import cPickle as pickle
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
......@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
def reader_creator(filename, sub_name, cycle=False):
def read_batch(batch):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
data = batch[six.b('data')]
labels = batch.get(
six.b('labels'), batch.get(six.b('fine_labels'), None))
assert labels is not None
for sample, label in zip(data, labels):
for sample, label in six.moves.zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
......@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False):
while True:
for name in names:
batch = pickle.load(f.extractfile(name))
if six.PY2:
batch = pickle.load(f.extractfile(name))
else:
batch = pickle.load(
f.extractfile(name), encoding='bytes')
for item in read_batch(batch):
yield item
if not cycle:
......
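The six.b(...) keys above are needed because a pickle written by Python 2 stores its dict keys as 8-bit strings, and loading it under Python 3 with encoding='bytes' surfaces them as bytes. A small stand-in illustration (the dumped dict below merely simulates a Python 2 pickle):

    import pickle
    import six

    blob = pickle.dumps({b'data': [0, 1], b'labels': [1]}, protocol=2)
    batch = pickle.loads(blob, encoding='bytes')
    assert six.b('data') in batch  # six.b('data') is 'data' on py2 and b'data' on py3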
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import requests
import hashlib
import os
......@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None):
total_length = r.headers.get('content-length')
if total_length is None:
with open(filename, 'w') as f:
with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
else:
with open(filename, 'w') as f:
with open(filename, 'wb') as f:
dl = 0
total_length = int(total_length)
for data in r.iter_content(chunk_size=4096):
......
......@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used
to initialize SRL model.
"""
from __future__ import print_function
import tarfile
import gzip
import itertools
import paddle.dataset.common
from six.moves import zip
import paddle.compat as cpt
from six.moves import zip, range
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
......@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name):
labels = []
one_seg = []
for word, label in zip(words_file, props_file):
word = word.strip()
label = label.strip().split()
word = cpt.to_text(word.strip())
label = cpt.to_text(label.strip().split())
if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])):
......
......@@ -28,6 +28,9 @@ Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
"""
from __future__ import print_function
import itertools
import functools
from .common import download
......@@ -116,7 +119,7 @@ def reader_creator(data_file,
for file in open(file_list):
file = file.strip()
batch = None
with open(file, 'r') as f:
with open(file, 'rb') as f:
batch = pickle.load(f)
data = batch['data']
labels = batch['label']
......
......@@ -29,10 +29,18 @@ the image layout as follows.
formats can be used for training. Note that the format should be
kept consistent between the training and inference periods.
"""
from __future__ import print_function
import numpy as np
try:
import cv2
except ImportError:
import sys
sys.stderr.write(
'''Warning with paddle image module: opencv-python should be installed,
otherwise the paddle image module can NOT work; please install opencv-python first.'''
)
cv2 = None
import os
import tarfile
......@@ -56,7 +64,7 @@ def batch_images_from_tar(data_file,
:type data_file: string
:param dataset_name: 'train','test' or 'valid'
:type dataset_name: string
:param img2label: a dict with image file name as key
and image's label as value
:type img2label: dict
:param num_per_batch: image number per batch file
......@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file,
output['data'] = data
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
file_id += 1
data = []
......@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file,
output['data'] = data
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta:
......@@ -113,7 +121,7 @@ def load_image_bytes(bytes, is_color=True):
Load an color or gray image from bytes array.
Example usage:
.. code-block:: python
with open('cat.jpg') as f:
......@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True):
load and return a gray image.
:type is_color: bool
"""
assert cv2 is not None
flag = 1 if is_color else 0
file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
img = cv2.imdecode(file_bytes, flag)
......@@ -137,7 +147,7 @@ def load_image(file, is_color=True):
Load an color or gray image from the file path.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
......@@ -149,6 +159,8 @@ def load_image(file, is_color=True):
load and return a gray image.
:type is_color: bool
"""
assert cv2 is not None
# cv2.IMAGE_COLOR for OpenCV3
# cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
# cv2.IMAGE_GRAYSCALE for OpenCV3
......@@ -161,27 +173,29 @@ def load_image(file, is_color=True):
def resize_short(im, size):
"""
"""
Resize an image so that the length of shorter edge is size.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the shorter edge size of image after resizing.
:type size: int
"""
assert cv2 is not None
h, w = im.shape[:2]
h_new, w_new = size, size
if h > w:
h_new = size * h / w
h_new = size * h // w
else:
w_new = size * w / h
w_new = size * w // h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
return im
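With floor division the computed shape stays integral while preserving the aspect ratio; for example, with illustrative numbers:

    h, w, size = 480, 640, 256
    if h > w:
        h_new, w_new = size * h // w, size
    else:
        h_new, w_new = size, size * w // h
    # -> (256, 341): the shorter edge (h) becomes 256, the longer edge scales to match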
......@@ -193,17 +207,17 @@ def to_chw(im, order=(2, 0, 1)):
according the order (2,0,1).
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
im = to_chw(im)
:param im: the input image with HWC layout.
:type im: ndarray
:param order: the transposed order.
:type order: tuple|list
"""
assert len(im.shape) == len(order)
im = im.transpose(order)
......@@ -215,11 +229,11 @@ def center_crop(im, size, is_color=True):
Crop the center of image with size.
Example usage:
.. code-block:: python
im = center_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
......@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True):
:type is_color: bool
"""
h, w = im.shape[:2]
h_start = (h - size) / 2
w_start = (w - size) / 2
h_start = (h - size) // 2
w_start = (w - size) // 2
h_end, w_end = h_start + size, w_start + size
if is_color:
im = im[h_start:h_end, w_start:w_end, :]
......@@ -243,11 +257,11 @@ def random_crop(im, size, is_color=True):
Randomly crop input image with size.
Example usage:
.. code-block:: python
im = random_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
......@@ -272,11 +286,11 @@ def left_right_flip(im, is_color=True):
Return the flipped image.
Example usage:
.. code-block:: python
im = left_right_flip(im)
:param im: input image with HWC layout or HW layout for gray image
:type im: ndarray
:param is_color: whether input image is color or not
......@@ -299,7 +313,7 @@ def simple_transform(im,
resizing, croping and flipping.
Example usage:
.. code-block:: python
im = simple_transform(im, 256, 224, True)
......@@ -314,7 +328,7 @@ def simple_transform(im,
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
......@@ -332,7 +346,7 @@ def simple_transform(im,
im = im.astype('float32')
if mean is not None:
mean = np.array(mean, dtype=np.float32)
# mean value, may be one value per channel
if mean.ndim == 1 and is_color:
mean = mean[:, np.newaxis, np.newaxis]
elif mean.ndim == 1:
......@@ -357,7 +371,7 @@ def load_and_transform(filename,
for the transform operations.
Example usage:
.. code-block:: python
im = load_and_transform('cat.jpg', 256, 224, True)
......@@ -372,7 +386,7 @@ def load_and_transform(filename,
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
......
......@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
"""
from __future__ import print_function
import paddle.dataset.common
import collections
import tarfile
import re
import string
import six
__all__ = ['build_dict', 'train', 'test', 'convert']
......@@ -42,13 +45,14 @@ def tokenize(pattern):
# sequential access of member files, rather than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = next(tarf)
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
None, string.punctuation).lower().split()
tf = next(tarf)
yield tarf.extractfile(tf).read().rstrip(six.b(
"\n\r")).translate(
None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def build_dict(pattern, cutoff):
......@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff):
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words)
return word_idx
......
......@@ -14,13 +14,17 @@
"""
imikolov's simple dataset.
This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
"""
from __future__ import print_function
import paddle.dataset.common
import collections
import tarfile
import six
__all__ = ['train', 'test', 'build_dict', 'convert']
......@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50):
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
word_freq = [
x for x in six.iteritems(word_freq) if x[1] > min_word_freq
]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words)
return word_idx
......@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type):
l = ['<s>'] + l.strip().split() + ['<e>']
if len(l) >= n:
l = [word_idx.get(w, UNK) for w in l]
for i in range(n, len(l) + 1):
for i in six.moves.range(n, len(l) + 1):
yield tuple(l[i - n:i])
elif DataType.SEQ == data_type:
l = l.strip().split()
......
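For the n-gram branch above, the window loop yields every length-n slice of the id sequence; a toy illustration with hypothetical ids:

    l = [0, 5, 8, 2, 1]  # e.g. ids for ['<s>', 'w1', 'w2', 'w3', '<e>']
    n = 3
    ngrams = [tuple(l[i - n:i]) for i in range(n, len(l) + 1)]
    assert ngrams == [(0, 5, 8), (5, 8, 2), (8, 2, 1)]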
......@@ -17,10 +17,15 @@ MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators.
"""
from __future__ import print_function
import paddle.dataset.common
import subprocess
import numpy
import platform
import tempfile
from six.moves import range
__all__ = ['train', 'test', 'convert']
URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
......@@ -45,23 +50,28 @@ def reader_creator(image_filename, label_filename, buffer_size):
# According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here.
m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
m.stdout.read(16) # skip some magic bytes
tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
m = subprocess.Popen(
[zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
tmp_image_file.seek(16) # skip some magic bytes
l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
l.stdout.read(8) # skip some magic bytes
# Python 3 will not take stdout as a file
tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
l = subprocess.Popen(
[zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
tmp_label_file.seek(8) # skip some magic bytes
try: # reader could be break.
while True:
labels = numpy.fromfile(
l.stdout, 'ubyte', count=buffer_size).astype("int")
tmp_label_file, 'ubyte', count=buffer_size).astype("int")
if labels.size != buffer_size:
break # numpy.fromfile returns empty slice after EOF.
images = numpy.fromfile(
m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
(buffer_size, 28 * 28)).astype('float32')
tmp_image_file, 'ubyte', count=buffer_size * 28 *
28).reshape((buffer_size, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
......
......@@ -22,11 +22,15 @@ set and test set into paddle reader creators.
"""
from __future__ import print_function
import zipfile
import paddle.dataset.common
import re
import random
import functools
import six
import paddle.compat as cpt
__all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
......@@ -112,6 +116,7 @@ def __initialize_meta_info__():
categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file):
line = cpt.to_text(line, encoding='latin')
movie_id, title, categories = line.strip().split('::')
categories = categories.split('|')
for c in categories:
......@@ -136,6 +141,7 @@ def __initialize_meta_info__():
USER_INFO = dict()
with package.open('ml-1m/users.dat') as user_file:
for line in user_file:
line = cpt.to_text(line, encoding='latin')
uid, gender, age, job, _ = line.strip().split("::")
USER_INFO[int(uid)] = UserInfo(
index=uid, gender=gender, age=age, job_id=job)
......@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
line = cpt.to_text(line, encoding='latin')
if (rand.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
......@@ -187,7 +194,7 @@ def max_movie_id():
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
def max_user_id():
......@@ -195,7 +202,7 @@ def max_user_id():
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, list(USER_INFO.values())).index
return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
def __max_job_id_impl__(a, b):
......@@ -210,7 +217,8 @@ def max_job_id():
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
return six.moves.reduce(__max_job_id_impl__,
list(USER_INFO.values())).job_id
def movie_categories():
......
......@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
"""
from __future__ import print_function
import os
import functools
import rarfile
......
......@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK
TODO(yuyang18): Complete dataset.
"""
from __future__ import print_function
import six
import collections
from itertools import chain
......@@ -64,7 +67,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = list(word_freq_dict.items())
words_sort_list = six.iteritems(word_freq_dict)
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.cifar
import unittest
......
......@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.common
import unittest
import tempfile
import glob
from six.moves import range
class TestCommon(unittest.TestCase):
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.flowers
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.imdb
import unittest
import re
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.imikolov
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.mnist
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.mq2007
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
......
......@@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import nltk
import paddle.dataset.sentiment as st
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.voc2012
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.wmt16
import unittest
......
......@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse training set and test set into paddle reader creators.
"""
import os
from __future__ import print_function
import numpy as np
import six
import tempfile
import tarfile
import os
......@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8):
return
data = np.fromfile(filename, sep=' ')
data = data.reshape(data.shape[0] / feature_num, feature_num)
data = data.reshape(data.shape[0] // feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in range(feature_num - 1):
for i in six.moves.range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
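The per-feature scaling above centers each column at its mean and divides by its range; roughly, with made-up numbers:

    import numpy as np

    data = np.array([[1.0, 10.0], [3.0, 30.0]])
    maximums, minimums = data.max(axis=0), data.min(axis=0)
    avgs = data.sum(axis=0) / data.shape[0]
    normalized = (data - avgs) / (maximums - minimums)
    # each feature is now centered on zero with unit range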
......@@ -137,7 +138,7 @@ def predict_reader():
It returns just one tuple data to do inference.
:return: one tuple data
:rtype: tuple
"""
global UCI_TEST_DATA
load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
......
......@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images
with segmentation has been increased from 7,062 to 9,993.
"""
from __future__ import print_function
import tarfile
import io
import numpy as np
......
......@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.
"""
from __future__ import print_function
import six
import tarfile
import gzip
import paddle.dataset.common
import paddle.compat as cpt
__all__ = [
'train',
......@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[line.strip()] = line_count
out_dict[cpt.to_text(line.strip())] = line_count
else:
break
return out_dict
......@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size):
]
for name in names:
for line in f.extractfile(name):
line_split = line.strip().split('\t')
line_split = line.strip().split(six.b('\t'))
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
......@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True):
tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse:
src_dict = {v: k for k, v in list(src_dict.items())}
trg_dict = {v: k for k, v in list(trg_dict.items())}
src_dict = {v: k for k, v in six.iteritems(src_dict)}
trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
return src_dict, trg_dict
......
......@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions.
}
"""
from __future__ import print_function
import os
import six
import tarfile
import gzip
from collections import defaultdict
import paddle.dataset.common
import paddle.compat as cpt
__all__ = [
"train",
......@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line_split = line.strip().split("\t")
line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split():
......@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate(
sorted(
iter(list(word_dict.items())),
key=lambda x: x[1],
six.iteritems(word_dict), key=lambda x: x[1],
reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0]))
......@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size))
if not os.path.exists(dict_path) or (
len(open(dict_path, "r").readlines()) != dict_size):
len(open(dict_path, "rb").readlines()) != dict_size):
__build_dict(tar_file, dict_size, dict_path, lang)
word_dict = {}
with open(dict_path, "r") as fdict:
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip()
word_dict[idx] = cpt.to_text(line.strip())
else:
word_dict[line.strip()] = idx
word_dict[cpt.to_text(line.strip())] = idx
return word_dict
......@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name):
line_split = line.strip().split("\t")
line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2:
continue
src_words = line_split[src_col].split()
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import warnings
"""
......
......@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid import framework as framework
from . import core
import collections
import copy
import six
from .. import compat as cpt
from . import unique_name
__all__ = ['append_backward']
......@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in list(inputs.items()):
for para, args in six.iteritems(inputs):
op_desc.set_input(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
for para, args in list(outputs.items()):
for para, args in six.iteritems(outputs):
op_desc.set_output(
para,
list(
......@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
if op_role_attr_name not in attrs:
attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in list(attrs.items()):
for name, val in six.iteritems(attrs):
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
......@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block):
"""
Infer the data type of given grad variable
"""
grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
if block.desc.has_var_recursive(fwd_name):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
fwd_name = _strip_grad_suffix_(grad_var_name)
if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
grad_var.set_dtype(fwd_var.dtype())
else:
grad_var.set_dtype(core.VarDesc.VarType.FP32)
......@@ -102,8 +105,10 @@ def _some_in_set_(cands, s):
"""
if len(cands) == 0:
return False
for c in cands:
if c in s:
literal_set = cpt.to_text(s)
literal_cands = cpt.to_text(cands)
for c in literal_cands:
if c in literal_set:
return True
return False
......@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
if isinstance(name, six.text_type):
name = name.encode()
pos = name.find(six.b(core.grad_var_suffix()))
name = cpt.to_text(name)
pos = name.find(core.grad_var_suffix())
return name[:pos] if pos != -1 else name
......@@ -125,9 +129,7 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
if isinstance(name, six.text_type):
name = name.encode()
return name + six.b(core.grad_var_suffix())
return cpt.to_text(name) + core.grad_var_suffix()
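Assuming core.grad_var_suffix() is '@GRAD' (as the docstrings above suggest), the two helpers round-trip a name like this:

    # illustrative only
    assert _strip_grad_suffix_('x@GRAD') == 'x'
    assert _strip_grad_suffix_('y@GRAD@RENAME@1') == 'y'
    assert _append_grad_suffix_('x') == 'x@GRAD'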
def _addup_repetitive_outputs_(op_descs):
......@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs):
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name)
for var_name, inputs in list(renamed_vars.items()):
for var_name, inputs in six.iteritems(renamed_vars):
if len(inputs) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
......@@ -243,7 +245,7 @@ from .proto import framework_pb2
def serialize_op_decs(op_desc):
protostr = op_desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr))
proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
return proto.__str__()
......@@ -364,7 +366,7 @@ def _append_backward_ops_(block,
# Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
......@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
new_vars = set()
# create new gradient variables
for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii")
if block.desc.has_var_recursive(
grad_var_name) or grad_var_name == core.empty_var_name():
if block.desc.has_var_recursive(cpt.to_bytes(
grad_var_name)) or grad_var_name == core.empty_var_name():
continue
block.desc.var(grad_var_name)
block.desc.var(cpt.to_bytes(grad_var_name))
new_vars.add(grad_var_name)
if grad_var_name not in grad_to_var:
continue
......@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc.rename_output(name, new_name)
var_map[name] = new_name
for g, ng in list(var_map.items()):
for g, ng in six.iteritems(var_map):
if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g)
......@@ -595,11 +596,12 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
parameters = parameter_list
else:
params = program.global_block().all_parameters()
program.global_block().iter_parameters()
parameters = [param.name for param in params]
params_and_grads = []
for param in parameters:
if param not in grad_info_map:
if cpt.to_text(param) not in grad_info_map:
continue
grad_info = grad_info_map[param]
grad_block = grad_info[1]
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import copy
import six
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .layers.control_flow import BlockGuard, equal
from .framework import Operator
from .layer_helper import LayerHelper, unique_name
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import decoder
from .decoder import *
from . import memory_usage_calc
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import beam_search_decoder
from .beam_search_decoder import *
......
......@@ -20,6 +20,8 @@ without using the low level API such as while ops.
This API is still under active development and may change drastically.
"""
from __future__ import print_function
import contextlib
import numpy as np
import six
......
......@@ -14,12 +14,16 @@
"""
This module provides a memory usage calculation function for users.
The purpose of this API is to allow users to estimate the memory usage of
a program under a specific batch size, so they can set an appropriate
batch size to fully utilize a GPU.
This API is still under active development and may change drastically.
"""
from __future__ import print_function
import six
from .. import core
from ..framework import Program, Variable
......@@ -45,15 +49,15 @@ def memory_usage(program, batch_size):
Args:
program(Program): The current Program.
batch_size(int): The current input data batch_size.
Returns:
min_total_memory(float): the estimated lower bound of memory usage.
max_total_memory(float): the estimated upper bound of memory usage.
unit_str(string): the unit of the estimated usage result.
Examples:
>>> import paddle.fluid as fluid
>>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
fluid.default_main_program(), batch_size=10)
......@@ -72,7 +76,7 @@ def memory_usage(program, batch_size):
# Get the var_name list of first block and calculate
total_memory = 0.0
for var in program.global_block().vars.itervalues():
for var in six.itervalues(program.global_block().vars):
data_count = 1
for x in var.shape:
if x == -1:
......@@ -81,10 +85,10 @@ def memory_usage(program, batch_size):
data_count *= x
var_memory = data_count * dtype_to_size[var.dtype]
if DEBUG:
print "%s memory usage: %d" % (var.name, var_memory)
print("%s memory usage: %d" % (var.name, var_memory))
total_memory += var_memory
if DEBUG:
print "total memory usage: %.2f" % (total_memory)
print("total memory usage: %.2f" % (total_memory))
# Convert appropriate unit
unit_str = "B"
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import core
import numpy
import os
......
......@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import six
import re
from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2
......@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
graph = GraphPreviewGenerator("some graph")
# collect parameters and args
protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr))
desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
def need_highlight(name):
if highlights is None: return False
......
......@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be
invoked in a new local scope.
"""
from __future__ import print_function
import paddle.fluid.core
import threading
......