Commit 812c5f60 authored by chengduoZH

remove conflict

@@ -8,15 +8,19 @@ function clock_to_seconds() {
}

function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
-  thread=`nproc`
-  if [ $thread -gt $bs ]; then
-    thread=$bs
+  trainers=`nproc`
+  if [ $trainers -gt $bs ]; then
+    trainers=$bs
  fi
-  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+  threads=$((`nproc` / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  export OPENBLAS_NUM_THREADS=$threads

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
@@ -28,7 +32,7 @@ function infer() {
      --config="${topology}.py" \
      --use_mkldnn=False \
      --use_gpu=False \
-      --trainer_count=$thread \
+      --trainer_count=$trainers \
      --log_period=$log_period \
      --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
      --init_model_path=$models_in \
...
set -e

function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  export OPENBLAS_NUM_THREADS=1
  topology=$1
  layer_num=$2
  bs=$3
...
@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
    :noindex:

+sub_seq
+---------
+.. autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
concat
------
.. autoclass:: paddle.v2.layer.concat
...
@@ -68,12 +68,6 @@ scale
   :noindex:

-reshape
----------
-.. autofunction:: paddle.v2.fluid.layers.reshape
-   :noindex:
-
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
...
@@ -79,7 +79,7 @@ class Optimizer(object):
    def minimize(self, loss, parameter_list):
        """Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list)
...
@@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像
------------------------------

-执行下面的命令获取最新的PaddlePaddle Docker镜像
+执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:

  .. code-block:: bash
@@ -27,7 +27,7 @@
     docker pull docker.paddlepaddle.org/paddle

-下载GPU版本的Docker镜像:
+下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像:

  .. code-block:: bash
@@ -54,7 +54,7 @@
.. _docker_run:

在Docker中执行PaddlePaddle训练程序
-------------------------------
+----------------------------------

假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
@@ -82,7 +82,7 @@
.. _docker_run_book:

使用Docker启动PaddlePaddle Book教程
-------------------------------
+-----------------------------------

使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
...
@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image
------------------------------

-Run the following command to download the latest Docker images:
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:

  .. code-block:: bash
@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
     docker pull docker.paddlepaddle.org/paddle

-Download GPU version images:
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:

  .. code-block:: bash
@@ -58,7 +58,7 @@ and run:
.. _docker_run:

Launch your training program in Docker
-------------------------------
+--------------------------------------

Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to
...
@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------

-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas

  .. code-block:: bash

     pip install paddlepaddle

-如果需要安装支持GPU的版本,需要执行:
+如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:

  .. code-block:: bash
...
@@ -12,14 +12,14 @@ Install Using pip
------------------------------

Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements.
+machine, it will also download requirements, the version is cpu_avx_openblas.

  .. code-block:: bash

     pip install paddlepaddle

-If you wish to install GPU version, just run:
+If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:

  .. code-block:: bash
...
@@ -7,13 +7,13 @@
++++++++

PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
-执行下面的命令完成快速安装:
+执行下面的命令完成快速安装,版本为cpu_avx_openblas

  .. code-block:: bash

     pip install paddlepaddle

-如果需要安装支持GPU的版本,需要执行:
+如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:

  .. code-block:: bash
...
@@ -8,13 +8,13 @@ Quick Install

You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install:
+Simply run the following command to install, the version is cpu_avx_openblas:

  .. code-block:: bash

     pip install paddlepaddle

-If you need to install GPU version, run:
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:

  .. code-block:: bash
...
@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)

-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+if (WITH_GPU)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+else()
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+endif ()
+
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+if (WITH_GPU)
+  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
+else()
+  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+endif()
+
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/data_transform.h"
+#include "paddle/framework/lod_tensor.h"

namespace paddle {
namespace framework {
...
@@ -27,9 +27,8 @@ limitations under the License. */
namespace paddle {
namespace framework {

-using DataTransformFN =
-    std::function<void(const std::vector<platform::DeviceContext*> ctx,
-                       const Variable& in, Variable* out)>;
+using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
+                                           const Variable& in, Variable* out)>;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

struct KernelTypePairHash {
@@ -47,7 +46,7 @@ struct KernelTypePairHash {
};

using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
+    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;

class DataTransformFnMap {
 public:
@@ -58,25 +57,25 @@ class DataTransformFnMap {
  }

  void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    Insert(std::make_pair(left, right), data_tranform_fn);
  }

  void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFN& data_tranform_fn) {
+              const DataTransformFn& data_tranform_fn) {
    PADDLE_ENFORCE(!Has(kernel_type_pair),
                   "KernelTypePair %s has been registered", "");
    map_.insert({kernel_type_pair, data_tranform_fn});
  }

-  const DataTransformFN& Get(const KernelTypePair& key_pair) const {
+  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
    auto data_transformer = GetNullable(key_pair);
    PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFN should not be NULL");
+                            "DataTransformFn should not be NULL");
    return *data_transformer;
  }

-  const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
+  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
    auto it = map_.find(key_pair);
    if (it == map_.end()) {
      return nullptr;
...
@@ -11,36 +11,61 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include <array>
+#include <vector>
+
+#include "paddle/framework/data_transform.h"

#include <gtest/gtest.h>
-#include "paddle/framework/data_transform.h"

namespace paddle {
namespace framework {
using namespace platform;

+/**
+ * @brief cross validation of different kernel type transform
+ * We use four bit map represent different combination.
+ * If the field has multiple possible value, only choose two of them.
+ * For DataType, only test the FP32(float), FP64(double).
+ * e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
+ *      1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
+ */
+
+std::array<proto::DataType, 2> kDataType = {
+    {proto::DataType::FP32, proto::DataType::FP64}};
+
+std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
+
+std::array<DataLayout, 2> kDataLayout = {
+    {DataLayout::kNHWC, DataLayout::kNCHW}};
+
+std::array<LibraryType, 2> kLibraryType = {
+    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+
+OpKernelType GenFromBit(const std::vector<bool> bits) {
+  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
+                      kLibraryType[bits[3]]);
+}
+
int test_value = 0;

-OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
-                           LibraryType::kCUDNN);
-OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
-OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
-                           DataLayout::kNCHW, LibraryType::kCUDNN);
+auto kernel0 = GenFromBit({0, 0, 0, 0});
+auto kernel1 = GenFromBit({0, 0, 0, 1});
+auto kernel2 = GenFromBit({0, 0, 1, 0});
+auto kernel3 = GenFromBit({0, 0, 1, 1});

-void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
+                     Variable* out) {
  test_value++;
}

-void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
+                       Variable* out) {
  test_value--;
}

-void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
-                    const Variable& in, Variable* out) {
+void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
+                        Variable* out) {
  test_value += 2;
}
@@ -49,12 +74,9 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,

namespace frw = paddle::framework;

-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
-                           frw::type1_to_type2);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
-                           frw::type2_to_type3);
-REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
-                           frw::type1_to_type3);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
+REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);

TEST(DataTransform, Register) {
  using namespace paddle::framework;
@@ -62,17 +84,16 @@ TEST(DataTransform, Register) {
  auto& instance = DataTransformFnMap::Instance();
  ASSERT_EQ(instance.Map().size(), 3UL);

-  std::vector<DeviceContext*> ctx;
+  DeviceContext* ctx = nullptr;
  paddle::framework::Variable in;
  paddle::framework::Variable out;

-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
-                                                                       &out);
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
  ASSERT_EQ(test_value, 1);
-  instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
  ASSERT_EQ(test_value, 0);
-  instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
-                                                                       &out);
+
+  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
  ASSERT_EQ(test_value, 2);
}
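As an aside, the DataTransformFnMap exercised above is essentially a hash map keyed by a (from, to) kernel-type pair. The following is a minimal standalone sketch of that map-with-pair-key pattern in plain C++; the names (KernelId, TransformFn, PairHash) are hypothetical and are not part of this commit.

#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <utility>

struct KernelId {
  int data_type;
  int layout;
  bool operator==(const KernelId& o) const {
    return data_type == o.data_type && layout == o.layout;
  }
};

using KernelIdPair = std::pair<KernelId, KernelId>;
using TransformFn = std::function<void(const float* in, float* out)>;

struct PairHash {
  size_t operator()(const KernelIdPair& p) const {
    // Pack the four small fields into one hash value.
    return (static_cast<size_t>(p.first.data_type) << 24) ^
           (static_cast<size_t>(p.first.layout) << 16) ^
           (static_cast<size_t>(p.second.data_type) << 8) ^
           static_cast<size_t>(p.second.layout);
  }
};

int main() {
  std::unordered_map<KernelIdPair, TransformFn, PairHash> registry;
  KernelId cpu_nchw{0, 0}, cpu_nhwc{0, 1};

  // Register one transform for the (cpu_nchw -> cpu_nhwc) pair.
  registry[{cpu_nchw, cpu_nhwc}] = [](const float* in, float* out) {
    out[0] = in[0];  // a real transform would permute the layout here
  };

  float src = 3.0f, dst = 0.0f;
  auto it = registry.find({cpu_nchw, cpu_nhwc});
  if (it != registry.end()) it->second(&src, &dst);
  std::cout << dst << "\n";  // prints 3
  return 0;
}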
@@ -14,18 +14,17 @@ limitations under the License. */

#include "paddle/framework/executor.h"

-#include <algorithm>
-#include <iostream>
-#include <memory>
#include <set>
-#include <vector>

+#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
-#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
-#include "paddle/framework/scope.h"
+
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");

namespace paddle {
namespace framework {
@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  }
}

+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&
+      tensor.type().hash_code() != typeid(double).hash_code()) {
+    return;
+  }
+  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
+  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+}
+
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  // TODO(tonyyang-svail):
@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
      auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
      VLOG(3) << op->DebugString();
      op->Run(*local_scope, place_);
+      if (FLAGS_check_nan_inf) {
+        for (auto& vname : op->OutputVars(true)) {
+          auto* var = local_scope->FindVar(vname);
+          if (var == nullptr) continue;
+          if (var->IsType<framework::LoDTensor>()) {
+            CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+          }
+        }
+      }
    }
-    if (create_local_scope) {
+    if (create_vars && create_local_scope) {
      scope->DeleteScope(local_scope);
    }
  }
...
@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
    places.emplace_back(platform::CPUPlace());
    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  platform::DeviceContextPool::Create(places);
+  platform::DeviceContextPool::Init(places);
  return true;
}
...
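The Create-to-Init rename above pairs with the DeviceContextPool::Instance() and pool.Get(place) calls used elsewhere in this commit. A minimal standalone sketch of that singleton-pool API shape follows; the Pool class and its integer payload are hypothetical and are not the commit's code.

#include <cassert>
#include <map>
#include <memory>
#include <string>

class Pool {
 public:
  // Global accessor; valid only after Init() has been called once.
  static Pool& Instance() {
    assert(pool_ != nullptr && "Call Pool::Init first");
    return *pool_;
  }
  // Builds the singleton from the given entries, first call wins.
  static void Init(const std::map<std::string, int>& entries) {
    if (pool_ == nullptr) pool_.reset(new Pool(entries));
  }
  // Per-key lookup, analogous to Get(place).
  int Get(const std::string& key) const { return entries_.at(key); }

 private:
  explicit Pool(const std::map<std::string, int>& entries)
      : entries_(entries) {}
  std::map<std::string, int> entries_;
  static std::unique_ptr<Pool> pool_;
};

std::unique_ptr<Pool> Pool::pool_;

int main() {
  Pool::Init({{"cpu", 0}, {"gpu0", 1}});
  assert(Pool::Instance().Get("gpu0") == 1);
  return 0;
}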
@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {

void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                       const platform::DeviceContext &dev_ctx) {
-  // TODO(typhoonzero): serialize to ostream
-  {  // the 1st field, uint32_t version
+  {  // the 1st field, uint32_t version for LoDTensor
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
  }
-  {  // the 2nd field, tensor description
-    // int32_t  size
-    // void*    protobuf message
-    proto::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto *pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto &gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void *>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char *>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  {  // the 4th field, lod information
-    // uint64_t  lod_level
-    // uint64_t  lod_level_1 size in byte.
-    // int*      lod_level_1 data
-    // ...
+  {
+    // the 2st field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int*     lod_level_1 data
+    // ...
    auto lod = tensor.lod();
    uint64_t size = lod.size();
    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
               static_cast<std::streamsize>(size));
    }
  }
+  // the 3st field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}

void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void *buf;
-    platform::Place cpu = platform::CPUPlace();
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char *>(buf), tensor->memory_size());
-  }
-  {  // read lod
+  {
+    // the 1st field, unit32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2st field, LoD information
    uint64_t lod_level;
    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
    auto &lod = *tensor->mutable_lod();
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
      lod[i] = tmp;
    }
  }
+  // the 3st filed, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
}

}  // namespace framework
...
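The rewritten LoDTensor serialization above layers the stream as a uint32 version, the LoD information, and then the plain Tensor (which itself carries a size-prefixed TensorDesc, moved into tensor_util.h later in this commit). Below is a minimal standalone sketch of that version-plus-length-prefixed stream pattern, round-tripped through string streams; the Write/Read helpers are hypothetical and are not the commit's code.

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

void Write(std::ostream& os, const std::string& header,
           const std::vector<float>& data) {
  constexpr uint32_t version = 0;  // 1st field: format version
  os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  int32_t hsize = static_cast<int32_t>(header.size());  // 2nd field: header
  os.write(reinterpret_cast<const char*>(&hsize), sizeof(hsize));
  os.write(header.data(), hsize);
  uint64_t n = data.size();  // 3rd field: raw payload
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os.write(reinterpret_cast<const char*>(data.data()), n * sizeof(float));
}

void Read(std::istream& is, std::string* header, std::vector<float>* data) {
  uint32_t version = 0;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  assert(version == 0 && "only version 0 is supported");
  int32_t hsize = 0;
  is.read(reinterpret_cast<char*>(&hsize), sizeof(hsize));
  header->resize(hsize);
  is.read(&(*header)[0], hsize);
  uint64_t n = 0;
  is.read(reinterpret_cast<char*>(&n), sizeof(n));
  data->resize(n);
  is.read(reinterpret_cast<char*>(data->data()), n * sizeof(float));
}

int main() {
  std::ostringstream oss;
  Write(oss, "desc", {1.f, 2.f, 3.f});
  std::istringstream iss(oss.str());
  std::string header;
  std::vector<float> data;
  Read(iss, &header, &data);
  assert(header == "desc" && data.size() == 3 && data[2] == 3.f);
  return 0;
}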
@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
  EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
}

+TEST_F(LoDTensorTester, SerializeAndDeserialize) {
+  LoDTensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+  std::ostringstream oss;
+  SerializeToStream(oss, lod_tensor_, cpu_ctx);
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+  float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], i);
+  }
+  EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
+}
+
TEST(LodExpand, test) {
  LoD lod{{0, 2}};
  LoDTensor tensor;
...
@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
  need_update_ = true;
}

+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
    : desc_(desc), need_update_(false) {
  // restore inputs_
...
@@ -35,6 +35,8 @@ class OpDesc {

  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);

+  void CopyFrom(const OpDesc &op_desc);
+
  proto::OpDesc *Proto();

  std::string Type() const { return desc_.type(); }
...
@@ -68,6 +68,8 @@ struct OpKernelType {
           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
           library_type_ == o.library_type_;
  }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
};

inline std::ostream& operator<<(std::ostream& os,
@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
  return os;
}

+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
}  // namespace framework
}  // namespace paddle
@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                              LibraryType::kCUDNN);

-  std::ostringstream stream;
-  stream << op_kernel_type;
  ASSERT_EQ(
-      stream.str(),
+      paddle::framework::KernelTypeToString(op_kernel_type),
      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
}
...
@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const Scope& scope_;
};

+const platform::DeviceContext* GetDeviceContext(
+    framework::KernelTypePair& kernel_pair) {
+  auto& actual_kernel_key = kernel_pair.first;
+  auto& expected_kernel_key = kernel_pair.second;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  if (platform::is_gpu_place(actual_kernel_key.place_) &&
+      platform::is_cpu_place(expected_kernel_key.place_)) {
+    return pool.Get(actual_kernel_key.place_);
+  } else if (platform::is_cpu_place(actual_kernel_key.place_) &&
+             platform::is_gpu_place(expected_kernel_key.place_)) {
+    return pool.Get(expected_kernel_key.place_);
+  } else {
+    PADDLE_THROW(
+        "Currently, model parallelism is only supported between CPU and CUDA");
+  }
+}
+
void OperatorWithKernel::Run(const Scope& scope,
                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-  auto dev_ctx = pool.Borrow(place);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);

  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
  }

  if (actual_kernel_key == expected_kernel_key) {
-    kernel_iter->second->Compute(ctx);
+    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
+                      "Currently, model parallelism is only supported between "
+                      "CPU and other devices. For example, multi-GPU model "
+                      "parallelism will failed.");
  } else {
-    Scope& op_scope = scope.NewScope();
-    auto input_vars = this->InputVars();
-    for (auto var_name : input_vars) {
-      op_scope.Var(var_name);
-    }
-
-    // TODO(qijun) get appropriate DeviceContext from DeviceContext pool
-    platform::DeviceContext* trans_dev_ctx = nullptr;
-    std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx};
-
-    // TODO(qijun) get appropriate DataTransformFN from global map
-    framework::DataTransformFN trans_fun = nullptr;
-
-    // Wait for transform starting
-    dev_ctx->Wait();
-
-    for (auto var_name : input_vars) {
-      trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)),
-                op_scope.FindVar(var_name));
-    }
-    // Wait for data transform finishing
-    for (auto ctx : trans_dev_ctx_vec) {
-      ctx->Wait();
-    }
-
-    // Create a new ExecutionContext
-    ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
-    kernel_iter->second->Compute(op_ctx);
+    auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
+    const DataTransformFn* trans_fun =
+        DataTransformFnMap::Instance().GetNullable(kernel_pair);
+    if (trans_fun) {
+      auto input_vars = this->InputVars();
+      // TODO(qijun) filter the input vars that do not need to be transformed
+
+      // filter vars that has been transformed
+      std::vector<std::string> need_trans;
+      for (auto var_name : input_vars) {
+        auto var_name_trans =
+            var_name + framework::KernelTypeToString(expected_kernel_key);
+        if (!scope.FindVar(var_name_trans)) {
+          const_cast<Scope&>(scope).Var(var_name_trans);
+          need_trans.push_back(var_name);
+        }
+      }
+
+      if (!need_trans.empty()) {
+        auto trans_dev_ctx = GetDeviceContext(kernel_pair);
+
+        // Wait for transform starting
+        dev_ctx->Wait();
+
+        for (auto var_name : need_trans) {
+          (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
+                       scope.FindVar(var_name + framework::KernelTypeToString(
+                                                    expected_kernel_key)));
+        }
+        // Wait for data transform finishing
+        trans_dev_ctx->Wait();
+      }
+    }
  }

+  kernel_iter->second->Compute(ctx);
}

OpKernelType OperatorWithKernel::GetActualKernelType(
...
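The else branch above stores a transformed input in the scope under the name var_name + KernelTypeToString(expected_kernel_key), so a variable is transformed at most once per target kernel type. A standalone sketch of that cache-by-suffixed-name idea follows; the names are hypothetical, a map stands in for the Scope, and a copy stands in for the real transform.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::vector<float>> scope;  // stand-in Scope
  scope["x"] = {1.f, 2.f};

  const std::string kernel_str =
      "data_type[FP32]:data_layout[NCHW]:place[CUDAPlace]:library_type[PLAIN]";

  auto transform_if_needed = [&](const std::string& name) {
    const std::string trans_name = name + kernel_str;
    if (scope.count(trans_name) == 0) {   // not cached for this kernel yet
      scope[trans_name] = scope[name];    // "transform" is just a copy here
      std::cout << "transformed " << name << "\n";
    }
  };

  transform_if_needed("x");  // runs the transform
  transform_if_needed("x");  // hits the cache, does nothing
  return 0;
}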
@@ -12,5 +12,58 @@ limitations under the License. */

#include "paddle/framework/selected_rows.h"

namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{
// the 2st field, rows information
auto& rows = selected_rows.rows();
uint64_t size = rows.size();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
for (uint64_t i = 0; i < size; ++i) {
os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
}
}
{
// the 3st field, the height of SelectedRows
int64_t height = selected_rows.height();
os.write(reinterpret_cast<const char*>(&height), sizeof(height));
}
// the 4st field, Tensor data
SerializeToStream(os, selected_rows.value(), dev_ctx);
}
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
auto tensor = *selected_rows->mutable_value();
{
// the 1st field, unit32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{
// the 2st field, rows information
uint64_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
auto& rows = *selected_rows->mutable_rows();
rows.resize(size);
for (uint64_t i = 0; i < size; ++i) {
is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
}
}
{
// the 3st field, the height of the SelectedRows
int64_t height;
is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
selected_rows->set_height(height);
}
// the 4st field, tensor which contains the data
DeserializeFromStream(is, &tensor);
}
} // namespace framework
}  // namespace paddle
@@ -59,5 +59,14 @@ class SelectedRows {
  int64_t height_;
};

+/*
+ * Serialize/Desiralize SelectedRows to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
}  // namespace framework
}  // namespace paddle
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
}

+TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+}
+
}  // namespace framework
}  // namespace paddle
@@ -178,7 +178,7 @@ class Tensor {
  DDim dims_;

  /**
-   * @brief the layout of memory block, default is NCHW.
+   * @brief the layout of memory block, default is NHWC.
   *
   * @note the memory allocation order, describe how weight/data is stored
   *       For example, in 4-D Tensor(rank=4), there are three commonly
...
@@ -15,12 +15,13 @@
#include <gtest/gtest.h>
#include <string>

+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
TEST(Tensor, Dims) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor tt;
+  framework::Tensor tt;
  tt.Resize({2, 3, 4});
-  DDim dims = tt.dims();
+  framework::DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(i + 2, dims[i]);
@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
}

TEST(Tensor, DataAssert) {
-  paddle::framework::Tensor src_tensor;
+  framework::Tensor src_tensor;

  bool caught = false;
  try {
    src_tensor.data<double>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (platform::EnforceNotMet err) {
    caught = true;
    std::string msg =
        "holder_ should not be null\nTensor holds no memory. Call "
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
   because Memory::Alloc() and Memory::Free() have not been ready.
*/
TEST(Tensor, MutableData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
  }

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
    float* p1 = nullptr;
    float* p2 = nullptr;
    // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p1, nullptr);
    // set src_tensor a new dim with large size
    // momery is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
    EXPECT_NE(p2, nullptr);
    EXPECT_NE(p1, p2);
    // set src_tensor a new dim with same size
    // momery block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
    // set src_tensor a new dim with smaller size
    // momery block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
    EXPECT_EQ(p1, p2);
  }
#endif
}

TEST(Tensor, ShareDataWith) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
    }
    ASSERT_TRUE(caught);

-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
    dst_tensor.ShareDataWith(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
}

TEST(Tensor, Slice) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
+                                 platform::CPUPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 3);
    EXPECT_EQ(slice_dims[0], 2);
    EXPECT_EQ(slice_dims[1], 3);
@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
+            slice_tensor.dims(), platform::CPUPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {

#ifdef PADDLE_WITH_CUDA
  {
-    Tensor src_tensor;
-    src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
-    DDim slice_dims = slice_tensor.dims();
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
+                                    platform::CUDAPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
+    framework::DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
    EXPECT_EQ(slice_dims[0], 4);
    EXPECT_EQ(slice_dims[1], 9);

    uintptr_t src_data_address =
        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
-    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
-        src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
+    uintptr_t src_mutable_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
+            src_tensor.dims(), platform::CUDAPlace()));
    uintptr_t slice_data_address =
        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
-    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
-        slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
+            slice_tensor.dims(), platform::CUDAPlace()));
    EXPECT_EQ(src_data_address, src_mutable_data_address);
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
}

TEST(Tensor, ReshapeToMatrix) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
+  framework::Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
  for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
    src_ptr[i] = i;
  }
-  Tensor res = ReshapeToMatrix(src, 2);
+  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
  ASSERT_EQ(res.dims()[0], 2 * 3);
  ASSERT_EQ(res.dims()[1], 4 * 9);
}

TEST(Tensor, Layout) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor src;
-  ASSERT_EQ(src.layout(), DataLayout::kNHWC);
-  src.set_layout(DataLayout::kAnyLayout);
-  ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
+  framework::Tensor src;
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  src.set_layout(framework::DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_util.h"
namespace paddle {
namespace framework {
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void operator()() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
CopyFrom(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct HasNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool HasNAN(const framework::Tensor& tensor) {
HasNANPredicate predicate;
return Any(tensor, predicate);
}
struct HasInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool HasInf(const framework::Tensor& tensor) {
HasInfPredicate predicate;
return Any(tensor, predicate);
}
} // namespace framework
} // namespace paddle
./tensor_util.cc
\ No newline at end of file
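The new tensor_util.cc above implements HasNAN/HasInf as an Eigen predicate reduced with any() over the tensor. The same idea, as a small standalone sketch over a plain float buffer using std::isnan and std::isinf (not the commit's Eigen-based code):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// Generic "does any element satisfy the predicate" reduction.
template <typename Pred>
bool Any(const std::vector<float>& buf, Pred pred) {
  return std::any_of(buf.begin(), buf.end(), pred);
}

bool HasNan(const std::vector<float>& buf) {
  return Any(buf, [](float v) { return std::isnan(v); });
}

bool HasInf(const std::vector<float>& buf) {
  return Any(buf, [](float v) { return std::isinf(v); });
}

int main() {
  assert(HasNan({0.f, NAN, 1.f}));
  assert(HasInf({0.f, INFINITY, 0.5f}));
  assert(!HasNan({1.f, 2.f}) && !HasInf({1.f, 2.f}));
  return 0;
}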
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/framework.pb.h"
#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"

namespace paddle {
namespace framework {
@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
      src_ptr, size);
}

// Returns true if a tensor contains NAN, i.e., Not A Number.
bool HasNAN(const framework::Tensor& tensor);
// Returns true if a tensor contains Inf, i.e., Infinity.
bool HasInf(const framework::Tensor& tensor);
inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
platform::Place cpu = platform::CPUPlace();
// TODO(Yancey1989): use VisiterDataType instead of DataType switch
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
}  // namespace framework
}  // namespace paddle
@@ -13,6 +13,7 @@

#include "paddle/framework/tensor_util.h"
#include <gtest/gtest.h>
+#include <cmath>
#include <string>

namespace paddle {
@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
#endif
}

TEST(HasNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
float* buf = src.mutable_data<float>({3}, CPUPlace());
buf[0] = 0.0;
buf[1] = NAN;
buf[2] = 0.0;
ASSERT_TRUE(HasNAN(src));
}
TEST(HasInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
double* buf = src.mutable_data<double>({3}, CPUPlace());
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
ASSERT_TRUE(HasInf(src));
}
TEST(Tensor, SerializeAndDeserialize) {
framework::Tensor src_tensor;
int array[6] = {1, 2, 3, 4, 5, 6};
src_tensor.Resize({2, 3});
int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
src_ptr[i] = array[i];
}
{
framework::Tensor dst_tensor;
auto place = new platform::CPUPlace();
platform::CPUDeviceContext cpu_ctx(*place);
std::ostringstream oss;
SerializeToStream(oss, src_tensor, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete place;
}
#ifdef PADDLE_WITH_CUDA
{
Tensor gpu_tensor;
gpu_tensor.Resize({2, 3});
Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
std::ostringstream oss;
SerializeToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete gpu_place;
}
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
static __global__ void FillNAN(float* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2] = NAN;
}
static __global__ void FillInf(float* buf) {
buf[0] = 0.0;
buf[1] = INFINITY;
buf[2] = 0.5;
}
TEST(HasNAN, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasNAN(tensor));
}
TEST(HasInf, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasInf(tensor));
}
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <condition_variable> #include <condition_variable>
#include <functional> #include <functional>
#include <future>
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <thread> #include <thread>
...@@ -25,10 +26,11 @@ limitations under the License. */ ...@@ -25,10 +26,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
typedef std::function<void()> Task;
class ThreadPool { class ThreadPool {
public: public:
typedef std::packaged_task<void()> Task;
typedef std::function<void()> Fun;
/** /**
* @brief Get a instance of threadpool, the thread number will * @brief Get a instance of threadpool, the thread number will
* be specified as the number of hardware thread contexts * be specified as the number of hardware thread contexts
...@@ -61,13 +63,18 @@ class ThreadPool { ...@@ -61,13 +63,18 @@ class ThreadPool {
/** /**
* @brief Push a function to the queue, and will be scheduled and * @brief Push a function to the queue, and will be scheduled and
* executed if a thread is available. * executed if a thread is available.
* @param[in] Task will be pushed to the task queue. * @param[in] Task, will be pushed to the task queue.
   * @return std::future<void>, the caller can wait for the task to finish
   * by calling f.wait().
*/ */
void Run(const Task& fn) { std::future<void> Run(const Fun& fn) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
tasks_.push(fn); Task task(std::bind(fn));
std::future<void> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock(); lock.unlock();
scheduled_.notify_one(); scheduled_.notify_one();
return f;
} }
/** /**
...@@ -110,7 +117,7 @@ class ThreadPool { ...@@ -110,7 +117,7 @@ class ThreadPool {
break; break;
} }
// pop a task from the task queue // pop a task from the task queue
auto task = tasks_.front(); auto task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
--available_; --available_;
......
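Since Run now returns a std::future<void> wrapping a packaged_task, callers can block per task instead of draining the whole pool with Wait(). A minimal usage sketch, mirroring the updated test below (illustrative only; the helper name SubmitAndWait is hypothetical):
// Illustrative usage sketch: submit n jobs and wait on each returned future.
// Assumes "paddle/framework/threadpool.h", <future> and <vector> are included.
void SubmitAndWait(paddle::framework::ThreadPool* pool, int n) {
  std::vector<std::future<void>> futures;
  for (int i = 0; i < n; ++i) {
    futures.push_back(pool->Run([i]() { /* per-task work for job i */ }));
  }
  for (auto& f : futures) {
    f.wait();  // returns once the corresponding task has executed
  }
}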
...@@ -20,16 +20,21 @@ limitations under the License. */ ...@@ -20,16 +20,21 @@ limitations under the License. */
namespace framework = paddle::framework; namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) { void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
pool->Run([&sum]() { sum.fetch_add(1); }); auto f = pool->Run([&sum]() { sum.fetch_add(1); });
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
} }
} }
TEST(ThreadPool, ConcurrentInit) { TEST(ThreadPool, ConcurrentInit) {
framework::ThreadPool* pool; framework::ThreadPool* pool;
int concurrent_cnt = 50; int n = 50;
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < concurrent_cnt; ++i) { for (int i = 0; i < n; ++i) {
std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) { ...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
} }
} }
TEST(ThreadPool, ConcurrentStart) { TEST(ThreadPool, ConcurrentRun) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0); std::atomic<int> sum(0);
std::vector<std::thread> threads; std::vector<std::thread> threads;
int concurrent_cnt = 50; int n = 50;
// sum = (n * (n + 1)) / 2 // sum = (n * (n + 1)) / 2
for (int i = 1; i <= concurrent_cnt; ++i) { for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i); std::thread t(do_sum, pool, std::ref(sum), i);
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) { ...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
t.join(); t.join();
} }
pool->Wait(); pool->Wait();
EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); EXPECT_EQ(sum, ((n + 1) * n) / 2);
} }
...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const { ...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarDesc::LOD_TENSOR_ARRAY: case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor(); return desc_.tensor_array().tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
} }
} }
......
...@@ -126,14 +126,165 @@ public: ...@@ -126,14 +126,165 @@ public:
inputData += inputChannels * inputHeight * inputWidth; inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth; outputData += outputChannels * outputHeight * outputWidth;
} }
}
};
#ifdef PADDLE_MOBILE_INFERENCE #ifdef PADDLE_MOBILE_INFERENCE
if (Device == DEVICE_TYPE_CPU) {
memory_.reset(); /*
* \brief Forward calculation of convolution, optimized for mobile.
*/
template <DeviceType Device>
class GemmConvMobileFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
// TODO(hedaoyuan): Need to define some index macros,
    // to avoid using 0 and 1.
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
real beta;
if (outputs[0].getArgType() == ADD_TO) {
beta = 1.0;
} else {
beta = 0.0;
} }
#endif
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
size_t colWidth = outputHeight * outputWidth;
    // Max col matrix height 256, Max col matrix width 2048
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
Im2ColMobileFunctor<real> im2col;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth;
int kStride = colHeight;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight;
colHeightStart += stepColHeight) {
for (size_t colWidthStart = 0; colWidthStart < colWidth;
colWidthStart += stepColWidth) {
int N = std::min(colWidth - colWidthStart, stepColWidth);
int K = std::min(colHeight - colHeightStart, stepColHeight);
// im2col
im2col(inputData + g * inputOffset,
imShape,
colData,
colShape,
strideH(),
strideW(),
paddingH(),
paddingW(),
dilationH(),
dilationW(),
colHeightStart,
K,
colWidthStart,
N);
// gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset + colHeightStart,
kStride,
colData,
N,
beta_,
outputData + g * outputOffset + colWidthStart,
nStride);
}
beta_ = 1.0;
}
} else {
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
BlasGemm<Device, real>::compute(false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
inputData + g * inputOffset,
N,
beta,
outputData + g * outputOffset,
N);
}
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
} }
}; };
#endif
/* /*
* \brief Backward input calculation of convolution. * \brief Backward input calculation of convolution.
*/ */
...@@ -348,7 +499,11 @@ public: ...@@ -348,7 +499,11 @@ public:
} }
}; };
#ifdef PADDLE_MOBILE_INFERENCE
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
#else
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
#endif
REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
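The mobile GemmConv path added above tiles the col matrix into blocks of at most 256 x 2048 elements and runs im2col plus GEMM per tile, so the scratch buffer stays bounded regardless of layer size (the first tile uses the caller's beta, later tiles accumulate with beta = 1). A rough sketch of the buffer-size arithmetic, illustrative only and with a hypothetical helper name:
// Illustrative only: upper bound on the scratch buffer allocated by
// GemmConvMobileFunction via resizeBuffer(stepColHeight * stepColWidth * sizeof(real)).
#include <algorithm>
#include <cstddef>
inline size_t MobileColBufferBytes(size_t colHeight, size_t colWidth, size_t elemSize) {
  size_t stepH = std::min(colHeight, static_cast<size_t>(256));
  size_t stepW = std::min(colWidth, static_cast<size_t>(2048));
  return stepH * stepW * elemSize;  // at most 256 * 2048 * 4 bytes = 2 MiB for float
}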
...@@ -98,4 +98,54 @@ public: ...@@ -98,4 +98,54 @@ public:
int dilationWidth = 1); int dilationWidth = 1);
}; };
template <class T>
class Im2ColMobileFunctor {
public:
void operator()(const T* imData,
const TensorShape& imShape,
T* colData,
const TensorShape& colShape,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int dilationHeight,
int dilationWidth,
int colHeightStart,
int colHeightSize,
int colWidthStart,
int colWidthSize) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) {
int wOffset = (colHeightStart + colh) % filterWidth;
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
for (int colw = 0; colw < colWidthSize; colw++) {
int h = (colWidthStart + colw) / outputWidth;
int w = (colWidthStart + colw) % outputWidth;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[colh * colWidthSize + colw] =
imData[imRowIdx * inputWidth + imColIdx];
}
}
}
}
};
} // namespace paddle } // namespace paddle
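In Im2ColMobileFunctor above, each col-matrix row index encodes (input channel, kernel row, kernel col) and each column index encodes (output row, output col); positions that fall into the padding region are written as zero. A small decode sketch of that index mapping (illustrative only; the struct and function names are hypothetical):
// Illustrative decode of the col-matrix indices used by Im2ColMobileFunctor.
struct ColRowIdx { int channel, kernelRow, kernelCol; };
struct ColColIdx { int outputRow, outputCol; };
inline ColRowIdx DecodeColRow(int row, int filterHeight, int filterWidth) {
  return {row / (filterWidth * filterHeight),   // c_im
          (row / filterWidth) % filterHeight,   // hOffset
          row % filterWidth};                   // wOffset
}
inline ColColIdx DecodeColCol(int col, int outputWidth) {
  return {col / outputWidth, col % outputWidth};  // (h, w)
}
// Example: with a 5x7 filter, row index 40 decodes to channel 1, kernel row 0, kernel col 5.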
...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); } ...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
#endif #endif
template <class T>
void TestIm2ColMobileFunctor() {
for (size_t channels : {32}) {
for (size_t inputHeight : {33, 100}) {
for (size_t inputWidth : {32, 96}) {
for (size_t filterHeight : {5}) {
for (size_t filterWidth : {7}) {
for (size_t stride : {2}) {
for (size_t padding : {1}) {
for (size_t dilation : {1, 3}) {
size_t filterSizeH = (filterHeight - 1) * dilation + 1;
size_t filterSizeW = (filterWidth - 1) * dilation + 1;
if (inputHeight + 2 * padding < filterSizeH ||
inputWidth + 2 * padding < filterSizeW)
break;
if (padding >= filterSizeH || padding >= filterSizeW) break;
size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
size_t outputWidth =
(inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape imShape =
TensorShape({channels, inputHeight, inputWidth});
TensorShape colShape1 = TensorShape({channels,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
size_t height = channels * filterHeight * filterWidth;
size_t width = outputHeight * outputWidth;
VectorPtr input1 =
Vector::create(imShape.getElements(), false);
VectorPtr input2 =
Vector::create(imShape.getElements(), false);
MatrixPtr output1 =
Matrix::create(height, width, false, false);
MatrixPtr output2 =
Matrix::create(height, width, false, false);
input1->uniform(0.001, 1);
input2->copyFrom(*input1);
Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
Im2ColMobileFunctor<T> im2Col2;
im2Col1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation);
im2Col2(input2->getData(),
imShape,
output2->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation,
0,
height,
0,
width);
autotest::TensorCheckEqual(*output1, *output2);
}
}
}
}
}
}
}
}
}
TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap, ...@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
} }
/* the size of inputs for norm-layer is 1 */ /* the size of inputs for norm-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1UL); CHECK_EQ(config_.inputs_size(), 1);
const NormConfig& conf = config_.inputs(0).norm_conf(); const NormConfig& conf = config_.inputs(0).norm_conf();
localSize_ = conf.size(); localSize_ = conf.size();
alpha_ = conf.scale(); alpha_ = conf.scale();
......
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h) set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET) function(op_library TARGET)
...@@ -48,6 +49,11 @@ function(op_library TARGET) ...@@ -48,6 +49,11 @@ function(op_library TARGET)
message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file") message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file")
endif() endif()
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
...@@ -181,56 +187,26 @@ endfunction() ...@@ -181,56 +187,26 @@ endfunction()
add_subdirectory(math) add_subdirectory(math)
add_subdirectory(nccl) add_subdirectory(nccl)
set(DEPS_OPS if(WITH_GPU)
cond_op op_library(nccl_op DEPS nccl_common)
cross_entropy_op else()
recurrent_op set(DEPS_OPS ${DEPS_OPS} nccl_op)
softmax_with_cross_entropy_op endif()
softmax_op
sequence_softmax_op
sum_op
pool_op
maxout_op
unpool_op
pool_with_index_op
conv_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
max_sequence_len_op
lstm_op
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op
save_op
load_op
send_op
recv_op
cos_sim_op)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(detail) add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties( set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
send_op.cc op_library(send_op DEPS ${DISTRIBUTE_DEPS})
PROPERTIES set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
set_source_files_properties( else()
recv_op.cc set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
PROPERTIES
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
endif() endif()
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op DEPS framework_proto tensor net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax) op_library(softmax_op DEPS softmax)
...@@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling) ...@@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting) op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling) op_library(unpool_op DEPS unpooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table) op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col) op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) op_library(recurrent_op DEPS executor)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
...@@ -271,13 +242,12 @@ endforeach() ...@@ -271,13 +242,12 @@ endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU) if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
...@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase { ...@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
size_t offset; size_t offset;
if (platform::is_gpu_place(i_tensor.place())) { if (platform::is_gpu_place(i_tensor.place())) {
......
...@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { ...@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
} }
auto slice = out->Slice(out_offset, out_offset + len); auto slice = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice); dev_ctx, &slice);
......
...@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase { ...@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
out != nullptr, out != nullptr,
"The Output(Out) should not be null if the Input(X) is set."); "The Output(Out) should not be null if the Input(X) is set.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
} }
......
...@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { ...@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Borrow(dev_place); auto& dev_ctx = *pool.Get(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx); framework::ExecutionContext ctx(*this, scope, dev_ctx);
......
...@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, ...@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
void CondOp::Run(const Scope& scope, const platform::Place& place) const { void CondOp::Run(const Scope& scope, const platform::Place& place) const {
// get device context from pool // get device context from pool
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Borrow(place); auto& dev_ctx = *pool.Get(place);
PrepareDataForSubnet(scope, dev_ctx); PrepareDataForSubnet(scope, dev_ctx);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope); std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
......
...@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups"); int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"Conv intput should be 4-D or 5-D tensor."); "Conv intput should be 4-D or 5-D tensor.");
...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddings.size(), strides.size(), paddings.size(), strides.size(),
"Conv paddings dimension and Conv strides dimension should be the same."); "Conv paddings dimension and Conv strides dimension should be the same.");
int input_channels = in_dims[1];
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter " "The number of input channels should be equal to filter "
"channels * groups."); "channels * groups.");
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
output_channels % groups, 0, output_channels % groups, 0,
"The number of output channels should be divided by groups."); "The number of output channels should be divided by groups.");
......
...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, " "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
"where N is the batch size and D is the number of classes. " " where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, " "This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator."); "which is almost always the result of a softmax operator.");
AddInput("Label", AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. When " "(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape " "soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a " "[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x K]."); "Tensor<float/double> with shape [N x D].");
AddOutput("Y", AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape " "(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss."); "[N x 1]. The cross entropy loss.");
......
...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(feed_item, place, dev_ctx, out_item); framework::CopyFrom(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
......
...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { ...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate // FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs? // CPU outputs?
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait(); dev_ctx.Wait();
......
...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
} }
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value); math::set_constant(dev_ctx, &out, value);
} }
}; };
......
...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { ...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
if (!force_cpu && platform::is_gpu_place(place)) { if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out // Copy tensor to out
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(tensor, place, dev_ctx, &out); framework::CopyFrom(tensor, place, dev_ctx, &out);
} }
} }
......
...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { ...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
framework::DeserializeFromStream(fin, tensor); DeserializeFromStream(fin, tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
// copy CPU to GPU // copy CPU to GPU
......
...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
auto slice = out[i].Slice(static_cast<int>(offset), auto slice = out[i].Slice(static_cast<int>(offset),
static_cast<int>(offset + len)); static_cast<int>(offset + len));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin), framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)), static_cast<int>(each_range.end)),
......
...@@ -9,9 +9,9 @@ if(WITH_GPU) ...@@ -9,9 +9,9 @@ if(WITH_GPU)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
...@@ -24,9 +24,9 @@ else() ...@@ -24,9 +24,9 @@ else()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context)
......
...@@ -302,8 +302,29 @@ void set_constant(const platform::DeviceContext& context, ...@@ -302,8 +302,29 @@ void set_constant(const platform::DeviceContext& context,
#endif #endif
} }
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>; template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>; template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>; template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>; template struct ColwiseSum<platform::CPUDeviceContext, double>;
......
...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>( ...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
template <typename T>
__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
int num) {
T tmp = 1.0 / width;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
int h = i * tmp;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <typename T>
struct RowwiseAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
input.data<T>(), vector.data<T>(), output->data<T>(),
static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
}
};
template struct RowwiseAdd<platform::CUDADeviceContext, float>; template struct RowwiseAdd<platform::CUDADeviceContext, float>;
template struct RowwiseAdd<platform::CUDADeviceContext, double>; template struct RowwiseAdd<platform::CUDADeviceContext, double>;
template struct ColwiseSum<platform::CUDADeviceContext, float>; template struct ColwiseSum<platform::CUDADeviceContext, float>;
......
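The new CUDA RowwiseAdd kernel above broadcasts the length-`width` vector across every row of the flattened [rows x width] input, recovering the column index from the flat element index. A CPU reference of the same computation (illustrative only; the function name is hypothetical):
// Illustrative CPU reference for RowwiseAddKernel: c[i] = a[i] + b[i % width].
inline void RowwiseAddReference(const float* a, const float* b, float* c,
                                int width, int num) {
  for (int i = 0; i < num; ++i) {
    c[i] = a[i] + b[i % width];
  }
}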
...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
eigen_out.device(*dev) = eigen_in.shuffle(permute); eigen_out.device(*dev) = eigen_in.shuffle(permute);
} }
template <typename DeviceContext, typename T>
void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenMatrix<T>::From(vector);
auto out = framework::EigenMatrix<T>::From(*output);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
out.device(*context.eigen_device()) =
in + vec.reshape(shape).broadcast(bcast);
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
......
...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>(); auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
......
...@@ -305,7 +305,7 @@ int main(int argc, char **argv) { ...@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
......
...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { ...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
false /*create_local_scope*/); false /*create_local_scope*/);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
for (size_t step_id = 0; step_id < seq_len; ++step_id) { for (size_t step_id = 0; step_id < seq_len; ++step_id) {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
......
...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { ...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
auto x_sliced = x.Slice(x_offset, x_offset + len); auto x_sliced = x.Slice(x_offset, x_offset + len);
auto out_sliced = out->Slice(out_offset, out_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
out_offset += len; out_offset += len;
return out_offset; return out_offset;
......
...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { ...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
} }
......
...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
This operator will recv tensor from send_op This operator will send tensor to recv_op.
)DOC"); )DOC");
AddAttr<std::vector<std::string>>("endpoints", AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
......
...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (dout_var == nullptr) { // dx_tensor fill zero if (dout_var == nullptr) { // dx_tensor fill zero
math::set_constant(dev_ctx, &dx_tensor, 0.0f); math::set_constant(dev_ctx, &dx_tensor, 0.0f);
......
...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
auto &x_lod = x.lod(); auto &x_lod = x.lod();
auto &mask_dim = mask.dims(); auto &mask_dim = mask.dims();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) { if (platform::is_cpu_place(mask.place())) {
......
...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { ...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
if (x_tensor.memory_size() > 0) { if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
CopyFrom(x_tensor, place, dev_ctx, out_tensor); CopyFrom(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
...@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp {
auto *out_tensor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, place); size_t offset = GetOffset(scope, place);
if (offset < x_array.size()) { if (offset < x_array.size()) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
......
...@@ -17,7 +17,7 @@ namespace platform { ...@@ -17,7 +17,7 @@ namespace platform {
DeviceContextPool* DeviceContextPool::pool = nullptr; DeviceContextPool* DeviceContextPool::pool = nullptr;
const platform::DeviceContext* DeviceContextPool::Borrow( const platform::DeviceContext* DeviceContextPool::Get(
const platform::Place& place) { const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( ...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
return it->second; return it->second;
} }
std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto it = device_contexts_.find(place);
if (it != device_contexts_.end()) {
borrowed_contexts.emplace_back(it->second);
} else {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
}
return borrowed_contexts;
}
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
......
...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext { ...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = CPUDeviceContext;
};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice; class EigenCudaStreamDevice;
...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
}; };
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext;
};
class CUDNNDeviceContext : public CUDADeviceContext { class CUDNNDeviceContext : public CUDADeviceContext {
public: public:
explicit CUDNNDeviceContext(CUDAPlace place); explicit CUDNNDeviceContext(CUDAPlace place);
...@@ -109,13 +122,13 @@ class DeviceContextPool { ...@@ -109,13 +122,13 @@ class DeviceContextPool {
public: public:
explicit DeviceContextPool(const std::vector<platform::Place>& places); explicit DeviceContextPool(const std::vector<platform::Place>& places);
static DeviceContextPool& Get() { static DeviceContextPool& Instance() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool; return *pool;
} }
  /*! \brief Create should only be called by Init function */ /*! \brief Create should only be called by Init function */
static DeviceContextPool& Create(const std::vector<platform::Place>& places) { static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
if (pool == nullptr) { if (pool == nullptr) {
pool = new DeviceContextPool(places); pool = new DeviceContextPool(places);
} }
...@@ -123,13 +136,14 @@ class DeviceContextPool { ...@@ -123,13 +136,14 @@ class DeviceContextPool {
} }
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
const platform::DeviceContext* Borrow(const platform::Place& place); const platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return handle of multi-device context. */
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places);
~DeviceContextPool() {} template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
......
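After this rename, the pool is obtained via Instance(), Get(place) returns the type-erased DeviceContext*, and the new GetByPlace() template returns the statically typed context selected by DefaultDeviceContextType. A short usage sketch (illustrative only; the helper name QueryPool is hypothetical, and the pool must have been created with Init first):
// Illustrative usage of the renamed DeviceContextPool API.
// Assumes "paddle/platform/device_context.h" is included.
inline void QueryPool() {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  const paddle::platform::DeviceContext* generic =
      pool.Get(paddle::platform::CPUPlace());         // type-erased handle
  const paddle::platform::CPUDeviceContext* typed =
      pool.GetByPlace(paddle::platform::CPUPlace());  // statically typed handle
  (void)generic;
  (void)typed;
}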
...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { ...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
DeviceContextPool& pool = DeviceContextPool::Get(); DeviceContextPool& pool = DeviceContextPool::Instance();
auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx1 = pool.Get(CPUPlace());
auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx2 = pool.Get(CPUPlace());
EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
std::vector<Place> gpu_places; std::vector<Place> gpu_places;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
gpu_places.emplace_back(CUDAPlace(i)); auto dev_ctx = pool.Get(CUDAPlace(i));
} ASSERT_NE(dev_ctx, nullptr);
auto dev_ctxs = pool.Borrow(gpu_places);
for (size_t i = 0; i < dev_ctxs.size(); ++i) {
auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
// check same as CUDAPlace(i)
CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
} }
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
"device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places; std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace()); places.emplace_back(paddle::platform::CPUPlace());
...@@ -109,7 +94,7 @@ int main(int argc, char** argv) { ...@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> { ...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
constexpr size_t num_threads = 1024; constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads; int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads;
......
...@@ -144,7 +144,7 @@ int main(int argc, char** argv) { ...@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h" #include "paddle/platform/variant.h"
namespace paddle { namespace paddle {
...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &); ...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
template <typename Visitor>
struct PlaceVisitorWrapper
: public boost::static_visitor<typename Visitor::result_type> {
const Visitor &visitor_;
explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
typename Visitor::result_type operator()(const CPUPlace &cpu) const {
return visitor_(cpu);
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
#else
PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
return typename Visitor::result_type();
#endif
}
};
template <typename Visitor>
typename Visitor::result_type VisitPlace(const Place &place,
const Visitor &visitor) {
return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -3,6 +3,7 @@ if(WITH_PYTHON) ...@@ -3,6 +3,7 @@ if(WITH_PYTHON)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init DEPS pybind python backward proto_desc paddle_memory executor prune init
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
target_link_libraries(paddle_pybind rt)
endif(WITH_PYTHON) endif(WITH_PYTHON)
if(WITH_DOC) if(WITH_DOC)
......
...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) { ...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVar(name); return self.HasVar(name);
}) })
.def("has_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.HasVarRecursive(name);
})
.def("find_var", .def("find_var",
[](BlockDesc &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVar(name); return self.FindVar(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("find_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.FindVarRecursive(name);
},
py::return_value_policy::reference)
.def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
.def("op_size", &BlockDesc::OpSize) .def("op_size", &BlockDesc::OpSize)
.def("op", &BlockDesc::Op, py::return_value_policy::reference) .def("op", &BlockDesc::Op, py::return_value_policy::reference)
...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) { ...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
.def("set_shape", &VarDesc::SetShape) .def("set_shape", &VarDesc::SetShape)
.def("set_dtype", &VarDesc::SetDataType) .def("set_dtype", &VarDesc::SetDataType)
.def("shape", &VarDesc::Shape, py::return_value_policy::reference) .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLodLevel) .def("lod_level", &VarDesc::GetLodLevel)
.def("set_lod_level", &VarDesc::SetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel)
.def("type", &VarDesc::GetType) .def("type", &VarDesc::GetType)
...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) { ...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
.value("BLOCK", proto::AttrType::BLOCK); .value("BLOCK", proto::AttrType::BLOCK);
py::class_<OpDesc> op_desc(m, "OpDesc", ""); py::class_<OpDesc> op_desc(m, "OpDesc", "");
op_desc.def("type", &OpDesc::Type) op_desc
.def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
py::return_value_policy::reference)
.def("copy_from", &OpDesc::CopyFrom)
.def("type", &OpDesc::Type)
.def("set_type", &OpDesc::SetType) .def("set_type", &OpDesc::SetType)
.def("input", &OpDesc::Input) .def("input", &OpDesc::Input)
.def("input_names", &OpDesc::InputNames) .def("input_names", &OpDesc::InputNames)
.def("set_input", &OpDesc::SetInput)
.def("output", &OpDesc::Output) .def("output", &OpDesc::Output)
.def("output_names", &OpDesc::OutputNames) .def("output_names", &OpDesc::OutputNames)
.def("set_input", &OpDesc::SetInput)
.def("set_output", &OpDesc::SetOutput) .def("set_output", &OpDesc::SetOutput)
.def("input_arg_names", &OpDesc::InputArgumentNames)
.def("output_arg_names", &OpDesc::OutputArgumentNames)
.def("rename_input", &OpDesc::RenameInput)
.def("rename_output", &OpDesc::RenameOutput)
.def("has_attr", &OpDesc::HasAttr) .def("has_attr", &OpDesc::HasAttr)
.def("attr_type", &OpDesc::GetAttrType) .def("attr_type", &OpDesc::GetAttrType)
.def("attr_names", &OpDesc::AttrNames) .def("attr_names", &OpDesc::AttrNames)
......
...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle.
} }
return ret_values; return ret_values;
}); });
m.def("get_grad_op_descs", m.def(
[](const OpDesc &op_desc, "get_grad_op_desc", [](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set, const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> &grad_to_var, const std::vector<BlockDesc *> &grad_sub_block) {
const std::vector<BlockDesc *> &grad_sub_block) { std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<OpDesc>> grad_op_descs = std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance() framework::OpInfoMap::Instance()
.Get(op_desc.Type()) .Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var, .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block); grad_sub_block);
std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size()); std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform( std::transform(grad_op_descs.begin(), grad_op_descs.end(),
grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(),
grad_op_desc_ptrs.begin(), [](std::unique_ptr<OpDesc> &p) { return p.release(); });
[](std::unique_ptr<OpDesc> &p) { return p.release(); }); return std::make_pair(grad_op_desc_ptrs, grad_to_var);
return grad_op_desc_ptrs; });
});
m.def("prune", [](const ProgramDesc &origin, m.def("prune", [](const ProgramDesc &origin,
const std::vector<std::array<size_t, 2>> &targets) { const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin); ProgramDesc prog_with_targets(origin);
...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc); InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc); return new ProgramDesc(pruned_desc);
}); });
m.def("empty_var_name", []() { return framework::kEmptyVarName; });
m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
m.def_submodule( m.def_submodule(
"var_names", "var_names",
"The module will return special predefined variable name in Paddle") "The module will return special predefined variable name in Paddle")
......
...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>( auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
pool.Borrow(tensor.place())); pool.Get(tensor.place()));
paddle::platform::GpuMemcpyAsync( paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( ...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
} }
......
...@@ -178,7 +178,7 @@ EOF ...@@ -178,7 +178,7 @@ EOF
# run paddle version to install python packages first # run paddle version to install python packages first
RUN apt-get update &&\ RUN apt-get update &&\
${NCCL_DEPS}\ ${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode && pip install -U pip && \ apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
pip install /*.whl; apt-get install -f -y && \ pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \ apt-get clean -y && \
rm -f /*.whl && \ rm -f /*.whl && \
......
...@@ -71,9 +71,7 @@ function threads_config() { ...@@ -71,9 +71,7 @@ function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors # according to trainer_count and total processors
# only when MKL enabled # only when MKL enabled
if [ "@WITH_MKL@" == "OFF" ]; then # auto set OPENBLAS_NUM_THREADS when do not use MKL
return 0
fi
processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
if [ -z $trainers ]; then if [ -z $trainers ]; then
...@@ -83,12 +81,19 @@ function threads_config() { ...@@ -83,12 +81,19 @@ function threads_config() {
if [ $threads -eq 0 ]; then if [ $threads -eq 0 ]; then
threads=1 threads=1
fi fi
if [ -z "$OMP_NUM_THREADS" ]; then if [ "@WITH_MKL@" == "ON" ]; then
export OMP_NUM_THREADS=$threads if [ -z "$OMP_NUM_THREADS" ]; then
fi export OMP_NUM_THREADS=$threads
if [ -z "$MKL_NUM_THREADS" ]; then fi
export MKL_NUM_THREADS=$threads if [ -z "$MKL_NUM_THREADS" ]; then
export MKL_NUM_THREADS=$threads
fi
else
if [ -z "$OPENBLAS_NUM_THREADS" ]; then
export OPENBLAS_NUM_THREADS=$threads
fi
fi fi
} }
PADDLE_CONF_HOME="$HOME/.config/paddle" PADDLE_CONF_HOME="$HOME/.config/paddle"
...@@ -150,7 +155,7 @@ fi ...@@ -150,7 +155,7 @@ fi
case "$1" in case "$1" in
"train") "train")
threads_config $@ threads_config $@
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
;; ;;
"merge_model") "merge_model")
......
...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid'] ...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In official 'readme', tstid is the flag of test data # In official 'readme', tstid is the flag of test data
......
...@@ -36,7 +36,7 @@ def __read_gflags_from_env__(): ...@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
""" """
import sys import sys
import core import core
read_env_flags = ['use_pinned_memory'] read_env_flags = ['use_pinned_memory', 'check_nan_inf']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
from paddle.v2.fluid import framework as framework from paddle.v2.fluid import framework as framework
from . import core
import collections
__all__ = ['append_backward_ops'] __all__ = ['append_backward']
def append_backward_ops(loss, parameter_list=None, no_grad_set=None): def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None,
end_idx=None):
if begin_idx is None:
begin_idx = 0
if end_idx is None:
end_idx = len(op_desc_list)
for i in range(begin_idx, end_idx):
op_desc = op_desc_list[i]
if isinstance(op_desc, tuple):
op_desc = op_desc[0]
op_desc.rename_input(old_name, new_name)
op_desc.rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs):
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in inputs.iteritems():
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
for name, val in attrs.iteritems():
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
op_desc.set_attr(name, val)
return op_desc
def _infer_var_data_type_(var_name, block):
grad_var = block.desc.find_var(var_name.encode("ascii"))
fwd_name = _strip_grad_suffix_(var_name.encode("ascii"))
if block.desc.has_var_recursive(fwd_name):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
grad_var.set_dtype(fwd_var.dtype())
else:
grad_var.set_dtype(core.DataType.FP32)
def _all_in_set_(cands, s):
for c in cands:
if not c in s:
return False
return True
def _strip_grad_suffix_(name):
pos = name.find(core.grad_var_suffix())
return name[:pos] if pos != -1 else name
def _append_grad_suffix_(name):
return name + core.grad_var_suffix()
def _addup_repetitive_outputs_(op_descs):
# In the backward part, a variable may be the output of more than one op.
# In this case, the variable should be the accumulation of all the outputs.
# We adopt adding `sum_op`s to implement the accumulation.
pending_sum_ops = []
var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list)
for idx, op_desc in enumerate(op_descs):
for var_name in op_desc.input_arg_names():
if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": renamed_vars[var_name]},
{"Out": [var_name]}, {}), idx))
renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names():
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
# empty variable or inplace op
continue
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1:
pending_sum_ops.append((_create_op_desc_(
"sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
# sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops):
op_descs.insert(p[1], p[0])
return op_descs
def _remove_no_grad_branch_(op_descs, no_grad_set):
# Remove ops whose outputs are all in no_grad_dict
op_descs = filter(
lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
op_descs)
# Insert fill_zeros_like_op
to_insert = []
for idx, op_desc in enumerate(op_descs):
for arg in op_desc.input_arg_names():
if core.grad_var_suffix() in arg and arg in no_grad_set:
to_insert.append((_create_op_desc_("fill_zeros_like", {
"X": [_strip_grad_suffix_(arg)]
}, {"Y": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
return op_descs
def _append_backward_ops_(target,
block,
target_block,
no_grad_dict,
grad_to_var,
callback=None):
grad_op_descs = []
program = block.program
for op in reversed(block.ops):
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
grad_sub_block = program.create_block(parent_idx=sub_block.idx)
_append_backward_ops_(target, sub_block, grad_sub_block,
no_grad_dict, grad_to_var, callback)
grad_sub_block_list.append(grad_sub_block.desc)
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
no_grad_dict[block.idx])
if target_block.idx == 0:
grad_op_descs.insert(
0,
_create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(target.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": target.dtype}))
# append op_desc in grad_op_descs to target_block
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc)
def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set()
# create new gradient variables
for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii")
if block.desc.has_var_recursive(
grad_var_name) or grad_var_name == core.empty_var_name():
continue
block.desc.var(grad_var_name)
new_vars.add(grad_var_name)
if not grad_to_var.has_key(grad_var_name):
continue
grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
# infer_shape and infer_type
op_desc.infer_var_type(block.desc)
op_desc.infer_shape(block.desc)
for arg in op_desc.output_arg_names():
if arg in new_vars:
_infer_var_data_type_(arg, block)
def append_backward(loss, parameter_list=None, no_grad_set=None):
""" """
Create and add gradient Operators in BlockDesc to compute Create and add gradient Operators in BlockDesc to compute
gradients of `loss` for parameters in parameter_list gradients of `loss` for parameters in parameter_list
:param loss: a variable generated by the cost function. :param loss: a variable generated by the cost function.
:type loss: Variable :type loss: Variable
:param no_grad_set: variable that should not create gradient :param no_grad_dict: variable that should not create gradient
:type no_grad_set: set :type no_grad_dict: set
:param parameter_list: parameters that need to compute gradient and :param parameter_list: parameters that need to compute gradient and
update to optimize the loss. update to optimize the loss.
:type: list :type: list
...@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): ...@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
""" """
assert isinstance(loss, framework.Variable) assert isinstance(loss, framework.Variable)
program = loss.block.program
no_grad_dict = dict()
if no_grad_set is None: if no_grad_set is None:
program = loss.block.program
assert isinstance(program, framework.Program) assert isinstance(program, framework.Program)
no_grad_set = list()
for block in program.blocks: for block in program.blocks:
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues(): for var in block.vars.itervalues():
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
if var.stop_gradient: if var.stop_gradient:
no_grad_set.append(var.name) block_no_grad_set.add(_append_grad_suffix_(var.name))
no_grad_set = set(no_grad_set) no_grad_dict[block.idx] = block_no_grad_set
elif isinstance(no_grad_set, set):
no_grad_dict = {0: no_grad_set}
else:
raise ValueError("'no_grad_set' should be a set or None.")
grad_info_map = dict()
root_block = program.block(0)
fwd_op_num = root_block.desc.op_size()
current_block_idx = program.current_block_idx
grad_to_var = dict()
_append_backward_ops_(loss, root_block, root_block, no_grad_dict,
grad_to_var)
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx
program.sync_with_cpp()
param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
if parameter_list is not None: if parameter_list is not None:
parameters = parameter_list parameters = parameter_list
else: else:
params = loss.block.program.global_block().all_parameters() params = program.global_block().all_parameters()
parameters = [param.name for param in params] parameters = [param.name for param in params]
params_and_grads = [] params_and_grads = []
for param in parameters: for param in parameters:
if param not in param_grad_map: if param not in grad_info_map:
raise ValueError("param %s is not in map" % param) raise ValueError("param %s is not in map" % param)
grad_info = param_grad_map[param] grad_info = grad_info_map[param]
grad_block = loss.block.program.block(grad_info[1]) grad_block = grad_info[1]
if not grad_block.has_var(grad_info[0]): if not grad_block.has_var(grad_info[0]):
raise ValueError("grad block[{0}] did not have grad var {1}".format( raise ValueError("grad block[{0}] did not have grad var {1}".format(
grad_info[1], grad_info[0])) grad_info[1], grad_info[0]))
# Get the param var from the global block # Get the param var from the global block
param_var = loss.block.program.global_block().var(param) param_var = program.global_block().var(param)
grad_var = grad_block.var(grad_info[0]) grad_var = grad_block.var(grad_info[0])
if loss.block.has_var(grad_info[0]): if loss.block.has_var(grad_info[0]):
params_and_grads.append((param_var, grad_var)) params_and_grads.append((param_var, grad_var))
......
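As a minimal usage sketch of the renamed entry point (assuming the paddle.v2.fluid API shown in this diff; the small network below is hypothetical), `append_backward` is called on a loss variable and returns (parameter, gradient) pairs, just as `append_backward_ops` did before:

.. code-block:: python

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.backward import append_backward

    # hypothetical two-layer classifier, only used to obtain a loss variable
    image = layers.data(name='image', shape=[784], dtype='float32')
    label = layers.data(name='label', shape=[1], dtype='int64')
    predict = layers.fc(input=image, size=10, act='softmax')
    cost = layers.cross_entropy(input=predict, label=label)
    avg_cost = layers.mean(x=cost)

    # appends the gradient ops for every trainable parameter reachable
    # from avg_cost and returns a list of (parameter, gradient) pairs
    params_grads = append_backward(loss=avg_cost)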
...@@ -3,7 +3,7 @@ import core ...@@ -3,7 +3,7 @@ import core
import numpy import numpy
import six.moves as six import six.moves as six
from framework import Variable from framework import Variable, default_main_program
__all__ = ['DataFeeder'] __all__ = ['DataFeeder']
...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object): ...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object): class DataFeeder(object):
def __init__(self, feed_list, place): def __init__(self, feed_list, place, program=None):
self.feed_dtypes = [] self.feed_dtypes = []
self.feed_names = [] self.feed_names = []
self.feed_shapes = [] self.feed_shapes = []
self.feed_lod_level = [] self.feed_lod_level = []
if program is None:
program = default_main_program()
for each_var in feed_list: for each_var in feed_list:
if isinstance(each_var, basestring):
each_var = program.block(0).var(each_var)
if not isinstance(each_var, Variable): if not isinstance(each_var, Variable):
raise TypeError("Feed list should contain a list of variable") raise TypeError("Feed list should contain a list of variable")
self.feed_dtypes.append(each_var.dtype) self.feed_dtypes.append(each_var.dtype)
......
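With the new `program` argument and the name lookup above, `DataFeeder` should now accept plain variable names in `feed_list` and resolve them in block 0 of the given (or default main) program. A hedged sketch, reusing the hypothetical `image`/`label` variables created in the default main program:

.. code-block:: python

    import paddle.v2.fluid as fluid

    place = fluid.CPUPlace()
    # strings are looked up via program.block(0).var(name); Variable
    # objects are still accepted as before
    feeder = fluid.DataFeeder(feed_list=['image', 'label'], place=place)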
...@@ -846,9 +846,11 @@ class Program(object): ...@@ -846,9 +846,11 @@ class Program(object):
self.sync_with_cpp() self.sync_with_cpp()
return param_to_grad_info return param_to_grad_info
def create_block(self): def create_block(self, parent_idx=None):
new_block_idx = len(self.blocks) new_block_idx = len(self.blocks)
self.desc.append_block(self.current_block().desc) parent = self.current_block() if parent_idx is None else self.block(
parent_idx)
self.desc.append_block(parent.desc)
self.current_block_idx = new_block_idx self.current_block_idx = new_block_idx
self.blocks.append(Block(self, self.current_block_idx)) self.blocks.append(Block(self, self.current_block_idx))
return self.current_block() return self.current_block()
......
...@@ -188,7 +188,7 @@ def save_inference_model(dirname, ...@@ -188,7 +188,7 @@ def save_inference_model(dirname,
raise ValueError("'feed_var_names' should be a list of str.") raise ValueError("'feed_var_names' should be a list of str.")
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
feeded_var_names = [feeded_var_names] target_vars = [target_vars]
else: else:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
......
...@@ -270,6 +270,7 @@ def gru_unit(input, ...@@ -270,6 +270,7 @@ def gru_unit(input,
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
# create bias # create bias
if bias is None: if bias is None:
bias_size = [1, 3 * size] bias_size = [1, 3 * size]
bias = helper.create_parameter( bias = helper.create_parameter(
...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs): ...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
def cross_entropy(input, label, **kwargs): def cross_entropy(input, label, **kwargs):
""" """
This function computes cross_entropy using the input and label. **Cross Entropy Layer**
This layer computes the cross entropy between `input` and `label`. It supports
both standard cross-entropy and soft-label cross-entropy loss computation.
1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
.. math::
Y[i] = -\log(X[i, Label[i]])
2) Soft-label cross-entropy:
`soft_label = True`, `Label[i, j]` indicates the soft label of class j
for sample i:
.. math::
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summation of each row of `label`
equals one.
3) One-hot cross-entropy with vectorized `label`:
As a special case of 2), when each row of 'label' has only one
non-zero element which is equal to 1, soft-label cross-entropy degenerates
to a one-hot cross-entropy with one-hot label representation.
Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability
computed by the previous operator, which is almost always the result
of a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape
[N x 1]. When `soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
the given labels as soft labels, default `False`.
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
`soft_label == True`, and the 2nd dimension of `input` and `label` are not \
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
Examples:
.. code-block:: python
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
helper = LayerHelper('cross_entropy', **kwargs) helper = LayerHelper('cross_entropy', **kwargs)
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
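The two formulas in the docstring can be checked by hand; the numpy computation below (all values made up, purely illustrative) reproduces the one-hot and soft-label cases:

.. code-block:: python

    import numpy as np

    X = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])            # probabilities, shape [N=2, D=3]
    hard_label = np.array([0, 1])              # class index per sample
    soft_label = np.array([[0.9, 0.05, 0.05],
                           [0.1, 0.8, 0.1]])   # each row sums to one

    # 1) one-hot:    Y[i] = -log(X[i, Label[i]])
    y_hard = -np.log(X[np.arange(2), hard_label])        # approx. [0.357, 0.223]
    # 2) soft-label: Y[i] = -sum_j Label[i, j] * log(X[i, j])
    y_soft = -(soft_label * np.log(X)).sum(axis=1)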
...@@ -514,14 +567,83 @@ def conv2d(input, ...@@ -514,14 +567,83 @@ def conv2d(input,
groups=None, groups=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
act=None, act=None):
name=None):
""" """
This function creates the op for a 2-dimensional Convolution. **Convolution2D Layer**
This is performed using the parameters of filters(size, dimensionality etc)
, stride and other configurations for a Convolution operation. The convolution2D layer calculates the output based on the input, filter
This function can also append an activation on top of the and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output)
conv-2d output, if mentioned in the input parameters. are in NCHW format, where N is batch size, C is the number of channels, H is the height
of the feature, and W is the width of the feature.
For details of the convolution layer, please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
If bias attribution and activation type are provided, bias is added to the output of the convolution,
and the corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filters. It is the same as the number of
output image channels.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
groups(int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
act(str): Activation type. Default: None
Returns:
Variable: The tensor variable storing the convolution and \
non-linearity activation result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
""" """
if stride is None: if stride is None:
......
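The H_out/W_out formula in the conv2d docstring can be sanity-checked with a few lines of plain Python; for the docstring's own example (3x32x32 input, 3x3 filter, default stride 1, padding 0, dilation 1) it predicts a 30x30 output:

.. code-block:: python

    def conv_out_size(in_size, filter_size, padding=0, stride=1, dilation=1):
        # H_out = (H_in + 2*padding - (dilation*(H_f - 1) + 1)) / stride + 1
        return (in_size + 2 * padding - (dilation * (filter_size - 1) + 1)) // stride + 1

    print(conv_out_size(32, 3))   # 30 -> output shape [N, 2, 30, 30]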
from collections import defaultdict from collections import defaultdict
import framework import framework
from backward import append_backward_ops from backward import append_backward
from framework import unique_name, program_guard from framework import unique_name, program_guard
from initializer import Constant from initializer import Constant
from layer_helper import LayerHelper from layer_helper import LayerHelper
...@@ -194,10 +194,10 @@ class Optimizer(object): ...@@ -194,10 +194,10 @@ class Optimizer(object):
no_grad_set=None): no_grad_set=None):
"""Add operations to minimize `loss` by updating `parameter_list`. """Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward_ops()` and This method combines interface `append_backward()` and
`create_optimization_pass()` into one. `create_optimization_pass()` into one.
""" """
params_grads = append_backward_ops(loss, parameter_list, no_grad_set) params_grads = append_backward(loss, parameter_list, no_grad_set)
params_grads = append_gradient_clip_ops(params_grads) params_grads = append_gradient_clip_ops(params_grads)
......
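A hedged sketch of the combined interface described in the docstring, assuming an `SGDOptimizer` class is available in paddle.v2.fluid.optimizer and that `avg_cost` is a loss variable like the one built above:

.. code-block:: python

    import paddle.v2.fluid.optimizer as optimizer

    sgd = optimizer.SGDOptimizer(learning_rate=0.01)
    # minimize() now calls append_backward() internally and then runs
    # create_optimization_pass() on the resulting (param, grad) pairs
    sgd.minimize(avg_cost)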
...@@ -4,7 +4,7 @@ import random ...@@ -4,7 +4,7 @@ import random
import itertools import itertools
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import collections import collections
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import Program, OpProtoHolder from paddle.v2.fluid.framework import Program, OpProtoHolder
...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase): ...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc) op_loss.desc.infer_shape(block.desc)
param_grad_list = append_backward_ops( param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
feed_dict = { feed_dict = {
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase): ...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
total_sum = layers.sums(input=[a_sum, x_sum]) total_sum = layers.sums(input=[a_sum, x_sum])
total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
append_backward_ops(total_sum_scaled) append_backward(total_sum_scaled)
g_vars = map(default_main_program().global_block().var, g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]) [each_x.name + "@GRAD" for each_x in x])
......
...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers ...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase): ...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
outs = exe.run(feed={'X': x}, fetch_list=[out])[0] outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
print outs print outs
loss = layers.mean(x=out) loss = layers.mean(x=out)
append_backward_ops(loss=loss) append_backward(loss=loss)
outs = exe.run( outs = exe.run(
feed={'X': x}, feed={'X': x},
fetch_list=[ fetch_list=[
......
...@@ -4,7 +4,7 @@ import numpy ...@@ -4,7 +4,7 @@ import numpy
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): ...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
mean = layers.mean(x=result) mean = layers.mean(x=result)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestOptimizer(unittest.TestCase): class TestOptimizer(unittest.TestCase):
...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
momentum_optimizer = self.MockMomentum( momentum_optimizer = self.MockMomentum(
learning_rate=learning_rate, momentum=0.2, use_nesterov=True) learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase): ...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adagrad_optimizer = self.MockAdagrad( adagrad_optimizer = self.MockAdagrad(
learning_rate=learning_rate, epsilon=1.0e-6) learning_rate=learning_rate, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase): ...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adam_optimizer = self.MockAdam( adam_optimizer = self.MockAdam(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adam_optimizer.get_accumulators()), 0) self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase): ...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adamax_optimizer = self.MockAdamax( adamax_optimizer = self.MockAdamax(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): ...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
decayed_adagrad_optimizer = self.MockDecayedAdagrad( decayed_adagrad_optimizer = self.MockDecayedAdagrad(
learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
opts = decayed_adagrad_optimizer.create_optimization_pass( opts = decayed_adagrad_optimizer.create_optimization_pass(
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, grad_var_name from paddle.v2.fluid.framework import Program, grad_var_name
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase): ...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
def test_backward(self): def test_backward(self):
self.check_forward() self.check_forward()
append_backward_ops(self.output) append_backward(self.output)
ana_grad = [np.array(x) for x in self.backward()] ana_grad = [np.array(x) for x in self.backward()]
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.regularizer as regularizer import paddle.v2.fluid.regularizer as regularizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestL2DecayRegularizer(unittest.TestCase): class TestL2DecayRegularizer(unittest.TestCase):
...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase): ...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase): ...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
......
...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
new_dat = fluid.layers.reorder_lod_tensor_by_rank( new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table) x=dat, rank_table=table)
loss = fluid.layers.mean(x=new_dat) loss = fluid.layers.mean(x=new_dat)
fluid.backward.append_backward_ops(loss=loss) fluid.backward.append_backward(loss=loss)
cpu = fluid.CPUPlace() cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu) exe = fluid.Executor(cpu)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase): ...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
mem3_mean = layers.mean(x=mem3) mem3_mean = layers.mean(x=mem3)
append_backward_ops(loss=mem3_mean) append_backward(loss=mem3_mean)
x_grad = exe.run( x_grad = exe.run(
feed={'x': tensor}, feed={'x': tensor},
fetch_list=[main_program.global_block().var('x@GRAD')])[0] fetch_list=[main_program.global_block().var('x@GRAD')])[0]
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): ...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
mean = layers.mean(x=out) mean = layers.mean(x=out)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase): ...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
sum_result = layers.array_read(array=mem_array, i=i) sum_result = layers.array_read(array=mem_array, i=i)
loss = layers.mean(x=sum_result) loss = layers.mean(x=sum_result)
append_backward_ops(loss) append_backward(loss)
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
......