diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 8ee7fd28c58f2a2bcb82040eb824a37062bd4e9c..6cc9598947acbdacfbf4c4379987bab8ed7611b0 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -22,6 +22,7 @@ On each machine, we will test and compare the performance of training on single #### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet. Input image size - 3 * 224 * 224, Time: images/second @@ -55,6 +56,16 @@ Input image size - 3 * 224 * 224, Time: images/second +- Alexnet + +| BatchSize | 64 | 128 | 256 | +|--------------|--------| ------ | -------| +| OpenBLAS | 2.13 | 2.45 | 2.68 | +| MKLML | 66.37 | 105.60 | 144.04 | +| MKL-DNN | 399.00 | 498.94 | 626.53 | + +chart TBD + #### Inference Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz - VGG-19 diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5 --- /dev/null +++ b/doc/design/mkl/mkldnn_fluid.md @@ -0,0 +1,149 @@ +# Design Doc: Add MKLDNN Kernel in Fluid Operator + +## Principles + +First of all, we should follow some basical principles like: +1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. +2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). +3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels. +4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. + +## Sulution + +In general, there are four parts we should follow to run a MKL-DNN primitive. +- Create a primitive descriptor that describe this operator +- Create a primitive itself by primitive descriptor and the engine +- Create all memory buffers that primitive needed +- Launch a stream to execute the primitive created +More details can refer to [here](http://01org.github.io/mkl-dnn). + +It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \ +So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). + +It's assumed that following three conditions should be satisfied. +1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. +2. the `Input Tensor` inside `Compute` function is the one after converted. +3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. + +### Compute +The algorithm of `Compute` would be described as follow, let's take conv like an example. + +```c++ + + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); + + auto& dev_ctx = ctx.template device_context(); + + // find primitive by unique key from mkldnn context + // the op_key should be a unique name of this op instance + auto& p = dev_ctx.findPrimitive(op_key + "_fwd"); + + // assuming the input tensor inside this compute function is the one after converted + // this point should be guarantee by another mechanism + auto& i = dev_ctx.findMemory(op_key + "_input"); + + if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { + auto fwd_primitive_desc = createPrimitiveDesc(ctx); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); + shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); + shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); + shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out)); + + dev_ctx.addMemory(op_key+"_input", in); + dev_ctx.addMemory(op_key+"_output", out); + dev_ctx.addMemory(op_key+"_filer", wgt); + dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive); + dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); + } + + p = dev_ctx.findPrimitive(op_key + "_fwd"); + + PADDLE_ENFORCE(p, "Should have forward Primitive"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); + PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); + dev_ctx.submit(p); + dev_ctx.execute(); // the convert primitive should have already contained. + +``` + +The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this: +```c++ + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + algorithm algo = static_cast(ctx.Attr("convolution_algorithm_option")); + prop_kind pk = ctx.Attr("is_test") ? prop_kind::forward_inference : prop_kind::forward_training; + + auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/); + shared_ptr fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine())); + + return fwd_primitive_desc; + } +``` + +### MKLDNNDeviceContext +`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed. + + +### mkldnn_helper +Some functions would be put in `paddle/platform/mkldnn_helper.h`. +- create MKLDNN memories +- create MKLDNN primitives +- error check function +- etc + + +### Kernel Switch +We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it. + +`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. +`trans` would be like this: + +```c++ +void trans(inputs, ctx) override { + if (NoNeedTrans()) { + return; + } + // find reorder primitive by op_key from context + auto& dev_ctx = ctx.template device_context(); + auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + auto& i = dev_ctx.findMemory(op_key + "_src_input"); + + if (p == nullptr || i == nullptr || changeSized(i, input)) { + auto prim = createPrimitiveDesc(ctx); + auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data); + auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes()); + auto dst = createMemory(p->expected_desc(), newbuffer->data); + auto reorder_primitive(new mkldnn::reorder(src, dst)); + + dev_ctx.addMemory(op_key+"_src_input", src); + dev_ctx.addMemory(op_key+"_input", dst); + dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); + } + + p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + PADDLE_ENFORCE(p, "Should have Reorder Primitive"); + dev_ctx.submit(p); + if (! this->isMKLDNNKernel()) { + // execute immediately only if this is not mkldnn kernel function. + // otherwise, it can be executed with the operator primitive in Compute + dev_ctx.stream(); + } + // after submit, the input tensor in ExecutionContext should be changed as the converted one + // there should be another mechanism to ensure this +} +``` + +### Unit Test +All the functions should be tested corresponding. +TBD diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst index a2bdeead7841393fdfe90c78e5b91d9e61678a24..ed8a0c7e87da133138ecfc7ba6a8217d58b8f71d 100644 --- a/doc/faq/build_and_install/index_cn.rst +++ b/doc/faq/build_and_install/index_cn.rst @@ -109,3 +109,31 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二 解决办法是: * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 + +8. 下载MKLML库失败 +------------------ + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 + make[1]: *** 正在等待未完成的任务.... + +原因:网速或SSL链接原因,导致MKLML库下载不成功。 + +解决办法是:手动下载并安装,具体步骤如下。 + +.. code-block:: bash + + // 1. 进入对应的目录 + cd build/third_party/mklml/src/extern_mklml + + // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 手动下载且解压缩,并手动生成download成功标签: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. 接着编译即可 diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 25a0db27688a9984ad401cbd462fe70048b204da..c2a57a95ee6aa1b03a687f07de74810e8e753f29 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -59,7 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_test(threadpool_test SRCS threadpool_test.cc) +cc_library(threadpool SRCS threadpool.cc) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 7d7a444cf02ba0da88178a34c98f5ef7a95c3852..4a8669c3a41fceaad26878a79eabfd0affce86fd 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/platform/enforce.h" #include #include "paddle/platform/enforce.h" @@ -20,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace framework { -enum DataLayout { +enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, @@ -38,11 +39,11 @@ inline DataLayout StringToDataLayout(const std::string& str) { inline std::string DataLayoutToString(const DataLayout& data_layout) { switch (data_layout) { - case kNHWC: + case DataLayout::kNHWC: return "NHWC"; - case kNCHW: + case DataLayout::kNCHW: return "NCHW"; - case kAnyLayout: + case DataLayout::kAnyLayout: return "ANY_LAYOUT"; default: PADDLE_THROW("unknown DataLayou %d", data_layout); diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index aa66cf00f3be14c4ac9496326190428688fc3496..6baae6c2bb80a4f631cad89231ce4fcb8d94ed86 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -20,15 +20,15 @@ namespace framework { // For more details about the design of LibraryType, Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library -enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; +enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; inline std::string LibraryTypeToString(const LibraryType& library_type) { switch (library_type) { - case kPlain: + case LibraryType::kPlain: return "PLAIN"; - case kMKLDNN: + case LibraryType::kMKLDNN: return "MKLDNN"; - case kCUDNN: + case LibraryType::kCUDNN: return "CUDNN"; default: PADDLE_THROW("unknown LibraryType %d", library_type); diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index e9c45b958cd0a65bca62099324e951f298d9ecb1..97b542e345feab0bab701dd967558ce23375dc7f 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -40,6 +40,7 @@ struct OpKernelType { // place, data_type, library_type kinds less than 2^8 constexpr static int LEFT_SHIFT = 8; + proto::DataType data_type_; DataLayout data_layout_; platform::Place place_; diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 6a0c5133c9a6bb326ca51755242e75b6eb9e5474..b9f6884f7c4b6eeacda722762d485dea97fdcb63 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,12 +20,12 @@ limitations under the License. */ #include #include +#include "paddle/framework/data_layout.h" #include "paddle/framework/ddim.h" #include "paddle/memory/memory.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -115,6 +115,10 @@ class Tensor { inline void check_memory_size() const; + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + private: friend class LoDTensor; @@ -173,6 +177,19 @@ class Tensor { DDim dims_; + /** + * @brief the layout of memory block, default is NCHW. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + + DataLayout layout_ = DataLayout::kNHWC; + /** * @brief A PlaceHolder may be shared by more than one tensor. * diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 3d93b7808bc96143b5261e1ec41ad98b15c74975..6c6f298edc187a87677089e54c4c9046821282df 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -165,6 +165,7 @@ inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { size_t base = numel() / dims_[0]; Tensor dst; dst.holder_ = holder_; + dst.set_layout(layout_); DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index f347981f2e11fdc4770df8d5e7277cd55744ab3f..ca76a9fcb9079bab22f7b192c45903852c91797f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -200,3 +200,12 @@ TEST(Tensor, ReshapeToMatrix) { ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } + +TEST(Tensor, Layout) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + ASSERT_EQ(src.layout(), DataLayout::kNHWC); + src.set_layout(DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), DataLayout::kAnyLayout); +} diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index ebfb0e553877b776afe13d8a4e7a7ffa8710405e..692f5f1af7e0225a63d4ff7d47129d572c237f61 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -33,6 +33,7 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, src.check_memory_size(); dst->Resize(src.dims()); + dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); @@ -89,6 +90,7 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { src.check_memory_size(); dst->Resize(src.dims()); + dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 6fc243aaf6ea68d716e7bb6a73289197fde56247..f388c19f28ed28335818733f946d8eaf18464627 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -28,6 +28,7 @@ TEST(CopyFrom, Tensor) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); auto cpu_place = new platform::CPUPlace(); CopyFrom(src_tensor, *cpu_place, &dst_tensor); @@ -38,14 +39,18 @@ TEST(CopyFrom, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + Tensor slice_tensor = src_tensor.Slice(1, 2); - CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor); + CopyFrom(slice_tensor, *cpu_place, &dst_tensor); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + #ifdef PADDLE_WITH_CUDA { Tensor src_tensor; @@ -91,6 +96,8 @@ TEST(CopyFrom, Tensor) { for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); } #endif } diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b9be0646cffb16188e4a66981698e3891d10d51 --- /dev/null +++ b/paddle/framework/threadpool.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/threadpool.h" + +namespace paddle { +namespace framework { + +std::unique_ptr ThreadPool::threadpool(nullptr); +std::once_flag ThreadPool::init_flag; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index 9a1ece3ae8452399205e71fbaa26977710fcdcac..5f6b2d458f7ee764c22d203f285b78023b6012f3 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -13,15 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include -#include #include -#include #include #include #include -#include "paddle/platform/call_once.h" #include "paddle/platform/enforce.h" namespace paddle { @@ -81,10 +79,9 @@ class ThreadPool { } private: - ThreadPool& operator=(const ThreadPool&) = delete; - ThreadPool(const ThreadPool&) = delete; + DISABLE_COPY_AND_ASSIGN(ThreadPool); - ThreadPool(int num_threads) + explicit ThreadPool(int num_threads) : num_threads_(num_threads), available_(num_threads), running_(true) { threads_.resize(num_threads); for (auto& thread : threads_) { @@ -155,7 +152,5 @@ class ThreadPool { std::condition_variable completed_; }; -std::unique_ptr ThreadPool::threadpool(nullptr); -std::once_flag ThreadPool::init_flag; } // namespace framework } // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc index 78c762608ed920c1a586c99d68952ffcc653c75e..012d92a5edc415f0bb2f8a0ea38ffeb9549d54fa 100644 --- a/paddle/framework/threadpool_test.cc +++ b/paddle/framework/threadpool_test.cc @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "threadpool.h" #include #include -#include -#include -#include + +#include "threadpool.h" namespace framework = paddle::framework; diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 45157842a6f92348909498f83d304d53b36c7d47..c4e2c8bb88ec9c74bd782570c10fb217178c8e48 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -13,59 +13,113 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" +#include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/for_range.h" namespace paddle { namespace operators { +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment1_out_tensor = ctx.Output("Moment1Out"); - auto moment2_out_tensor = ctx.Output("Moment2Out"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment1_out_tensor->mutable_data(ctx.GetPlace()); - moment2_out_tensor->mutable_data(ctx.GetPlace()); + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment1 = framework::EigenVector::Flatten( - *ctx.Input("Moment1")); - auto moment2 = framework::EigenVector::Flatten( - *ctx.Input("Moment2")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto beta2_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta2Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); - auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); - - // All of these are tensors of 1 element - auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); - // Eigen does not support automatic broadcast - // Get dimensions of moment vector to broadcast lr_t - Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(*place) = - param - - lr_t.broadcast(m_dsize) * - (moment1_out / (moment2_out.sqrt() + epsilon)); + AdamFunctor functor(beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), + mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), param.numel()); + for_range(functor); } }; diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index 14abd4bf0a6e73a9c0f000f53a5e1e380f01d1c0..c57ee414dc5b3417549c8ac3a7fd57a9c8f452df 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/lstm_compute.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/sequence2batch.h" @@ -102,9 +103,12 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = ctx.Attr("gate_activation"); - auto cell_act = ctx.Attr("cell_activation"); - auto cand_act = ctx.Attr("candidate_activation"); + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -264,9 +268,12 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = ctx.Attr("gate_activation"); - auto cell_act = ctx.Attr("cell_activation"); - auto cand_act = ctx.Attr("candidate_activation"); + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/operators/math/detail/activation_functions.h index a20c35d1d9dc4a3a6fae92023fd1aae787a716ec..585a0123437a39c2b610306b18fe0a970c0ed072 100644 --- a/paddle/operators/math/detail/activation_functions.h +++ b/paddle/operators/math/detail/activation_functions.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/platform/enforce.h" #include "paddle/platform/hostdevice.h" #ifdef __AVX__ @@ -29,6 +30,26 @@ namespace detail { #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +inline ActivationType GetActivationType(const std::string &type) { + if (type == "sigmoid") { + return ActivationType::kSigmoid; + } else if (type == "relu") { + return ActivationType::kReLU; + } else if (type == "tanh") { + return ActivationType::kTanh; + } else if (type == "identity" || type == "") { + return ActivationType::kIdentity; + } + PADDLE_THROW("Not support type %s.", type); +} + namespace forward { template diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h index a734ad31eea4816e952641bad73776d93d8c8d34..42888fcdb0a464892e3007ee73c195fcd2a431bb 100644 --- a/paddle/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/operators/math/detail/lstm_cpu_kernel.h @@ -26,10 +26,9 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { T r_value_in; T r_value_ig; T r_value_fg; @@ -77,9 +76,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { T r_value_in; T r_value_ig; T r_value_fg; @@ -149,10 +148,9 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { #ifdef __AVX__ __m256 r_value_in; __m256 r_value_ig; @@ -204,9 +202,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { #ifdef __AVX__ __m256 r_value_in; __m256 r_value_ig; @@ -281,9 +279,8 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, template void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { avx_lstm_forward_one_sequence(op, value, frame_size, active_node, active_gate, active_state); @@ -295,9 +292,9 @@ void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, template void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, - int frame_size, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int frame_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, active_gate, active_state); diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h index 91bfedea53a2600156c9025f6ff3615d695a712b..e31e657e8b6964c2b99f6e456545c83d8da8e7f9 100644 --- a/paddle/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/operators/math/detail/lstm_gpu_kernel.h @@ -31,9 +31,9 @@ namespace detail { */ template __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, - int batch_size, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -91,9 +91,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, template __global__ void KeLstmBackward(Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, - int batch_size, activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + int batch_size, ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -185,9 +185,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, template void gpu_lstm_forward(const platform::DeviceContext& context, Op op, LstmMetaValue value, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { @@ -220,9 +219,8 @@ template void gpu_lstm_backward(const platform::DeviceContext& context, Op op, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, ActivationType active_gate, + ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h index 78f9a249a3d5d413452952edf990975c02f1a369..fed8f9c4ca48905ad4c524ba400e8c7bb2f7fbd1 100644 --- a/paddle/operators/math/detail/lstm_kernel.h +++ b/paddle/operators/math/detail/lstm_kernel.h @@ -30,9 +30,9 @@ class lstm { HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, T &prev_state, T &state, T &state_atv, T &output, T &checkI, T &checkF, T &checkO, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { value_in = activation(value_in, active_node); value_ig = activation(value_ig + prev_state * checkI, active_gate); value_fg = activation(value_fg + prev_state * checkF, active_gate); @@ -53,9 +53,9 @@ class lstm { __m256 &prev_state, __m256 &state, __m256 &state_atv, __m256 &output, __m256 &checkI, __m256 &checkF, __m256 &checkO, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { value_in = activation(value_in, active_node); value_ig = activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), @@ -87,9 +87,9 @@ class lstm { T &state_grad, T &state_atv, T &output_grad, T &checkI, T &checkF, T &checkO, T &checkIGrad, T &checkFGrad, T &checkOGrad, - activation_mode_t active_node, - activation_mode_t active_gate, - activation_mode_t active_state) { + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { grad_og = activation(output_grad * state_atv, value_og, active_gate); state_grad += activation(output_grad * value_og, state_atv, active_state) + grad_og * checkO; @@ -114,8 +114,8 @@ class lstm { __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, - __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node, - activation_mode_t active_gate, activation_mode_t active_state) { + __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, active_gate); state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index 2c2e8bb82e6f51e21a00de53bbfce5f0b4868e27..d453102ecefc9d79e1f4474ba94be0eb69a87c85 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -24,12 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, - ActiveType(cand_act), ActiveType(gate_act), - ActiveType(cell_act)); + cand_act, gate_act, cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; value.state_active_value += frame_size; @@ -46,12 +46,12 @@ struct LstmUnitGradFunctor { static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { for (int b = 0; b < batch_size; b++) { detail::cpu_lstm_backward(detail::backward::lstm(), value, grad, - frame_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, cand_act, gate_act, cell_act); value.gate_value += frame_size * 4; value.state_value += frame_size; diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index 92b1f4228b49709d2903fab518e7649133932fad..82065d699f760db6cc86bf3d6c56e51c583c6ace 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -24,11 +24,12 @@ template struct LstmUnitFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, batch_size, cand_act, gate_act, + cell_act); } }; @@ -37,11 +38,12 @@ struct LstmUnitGradFunctor { static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const std::string& gate_act, const std::string& cell_act, - const std::string& cand_act) { + const detail::ActivationType& gate_act, + const detail::ActivationType& cell_act, + const detail::ActivationType& cand_act) { detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, ActiveType(cand_act), - ActiveType(gate_act), ActiveType(cell_act)); + frame_size, batch_size, cand_act, gate_act, + cell_act); } }; diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 5f74e273585aea5184281bf294df694235150e30..954762f92286fe13bd2c08ec03c3ac96bb663cca 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" @@ -72,8 +73,9 @@ class LstmUnitFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, int frame_size, int batch_size, - const std::string &gate_act, const std::string &cell_act, - const std::string &cand_act); + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); }; template @@ -81,8 +83,9 @@ class LstmUnitGradFunctor { public: static void compute(const DeviceContext &context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, - const std::string &gate_act, const std::string &cell_act, - const std::string &cand_act); + const detail::ActivationType &gate_act, + const detail::ActivationType &cell_act, + const detail::ActivationType &cand_act); }; } // namespace math diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h new file mode 100644 index 0000000000000000000000000000000000000000..6ba6b01076103cf5660718b32a1989c14bc6dd70 --- /dev/null +++ b/paddle/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr size_t num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 668a48e816919cbc6a4bb646adedfbbdcb57922a..07e38476e68b79f5b3192c619c89cd0e061cc686 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -78,6 +78,10 @@ PYBIND11_PLUGIN(core) { [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) + .def("set_layout", + [](Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) .def("alloc_float", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 27c82c95f79e0a3e3129627bfa33d85e0d3cd862..44a6e344630bb35d28ee29078bf8727053a24bef 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,7 +14,7 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' ] from threading import Thread @@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"): return lines[:-1], lines[-1] -def pipe_reader(left_cmd, - parser, - bufsize=8192, - file_type="plain", - cut_lines=True, - line_break="\n"): +class PipeReader: """ - pipe_reader read data by stream from a command, take it's - stdout into a pipe buffer and redirect it to the parser to - parse, then yield data as your desired format. + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. - You can using standard linux command or call another program - to read data, from HDFS, Ceph, URL, AWS S3 etc: + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: - cmd = "hadoop fs -cat /path/to/some/file" - cmd = "cat sample_file.tar.gz" - cmd = "curl http://someurl" - cmd = "python print_s3_bucket.py" + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" - A sample parser: + An example: + + .. code-block:: python - def sample_parser(lines): - # parse each line as one sample data, - # return a list of samples as batches. - ret = [] - for l in lines: - ret.append(l.split(" ")[1:5]) - return ret - - :param left_cmd: command to excute to get stdout from. - :type left_cmd: string - :param parser: parser function to parse lines of data. - if cut_lines is True, parser will receive list - of lines. - if cut_lines is False, parser will receive a - raw buffer each time. - parser should return a list of parsed values. - :type parser: callable - :param bufsize: the buffer size used for the stdout pipe. - :type bufsize: int - :param file_type: can be plain/gzip, stream buffer data type. - :type file_type: string - :param cut_lines: whether to pass lines instead of raw buffer - to the parser - :type cut_lines: bool - :param line_break: line break of the file, like \n or \r - :type line_break: string - - :return: the reader generator. - :rtype: callable + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample """ - if not isinstance(left_cmd, str): - raise TypeError("left_cmd must be a string") - if not callable(parser): - raise TypeError("parser must be a callable object") - - # TODO(typhoonzero): add a thread to read stderr - - # Always init a decompress object is better than - # create in the loop. - dec = zlib.decompressobj( - 32 + zlib.MAX_WBITS) # offset 32 to skip the header - def reader(): - process = subprocess.Popen( - left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ remained = "" while True: - buff = process.stdout.read(bufsize) + buff = self.process.stdout.read(self.bufsize) if buff: - if file_type == "gzip": - decomp_buff = dec.decompress(buff) - elif file_type == "plain": + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": decomp_buff = buff else: - raise TypeError("file_type %s is not allowed" % file_type) + raise TypeError("file_type %s is not allowed" % + self.file_type) if cut_lines: lines, remained = _buf2lines(''.join( [remained, decomp_buff]), line_break) - parsed_list = parser(lines) - for ret in parsed_list: - yield ret + for line in lines: + yield line else: - for ret in parser(decomp_buff): - yield ret + yield decomp_buff else: break - - return reader diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 06e14796daf27812b9aeb1e4b024f294c7609991..4ba71969dffe7447b6c5b70aeb752a4e5469fb36 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase): class TestPipeReader(unittest.TestCase): def test_pipe_reader(self): - def simple_parser(lines): - return lines + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l import tempfile @@ -159,17 +162,12 @@ class TestPipeReader(unittest.TestCase): for r in records: f.write('%s\n' % r) - cmd = "cat %s" % temp.name - reader = paddle.v2.reader.pipe_reader( - cmd, simple_parser, bufsize=128) - for i in xrange(4): - result = [] - for r in reader(): - result.append(r) - - for idx, e in enumerate(records): - print e, result[idx] - self.assertEqual(e, result[idx]) + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) finally: # delete the temporary file temp.close()