diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index da034f3b9dff794e22086a5295ad2b0c2361c356..71a49231a5527ebee9f45d5f4650ce2a4f6a1c31 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -8,15 +8,19 @@ function clock_to_seconds() { } function infer() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY topology=$1 layer_num=$2 bs=$3 - thread=`nproc` - if [ $thread -gt $bs ]; then - thread=$bs + trainers=`nproc` + if [ $trainers -gt $bs ]; then + trainers=$bs fi - log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log" + log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log" + threads=$((`nproc` / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + export OPENBLAS_NUM_THREADS=$threads models_in="models/${topology}-${layer_num}/pass-00000/" if [ ! -d $models_in ]; then @@ -28,7 +32,7 @@ function infer() { --config="${topology}.py" \ --use_mkldnn=False \ --use_gpu=False \ - --trainer_count=$thread \ + --trainer_count=$trainers \ --log_period=$log_period \ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ --init_model_path=$models_in \ diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index e9df83fee2a3f796b7234b39619364f6ee4d5dc9..935cff6f2c97d25d6de556cfee25e27dbe49b5b6 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,7 +1,7 @@ set -e function train() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + export OPENBLAS_NUM_THREADS=1 topology=$1 layer_num=$2 bs=$3 diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d81481ca819c13ee0e299c204f998f3915c34bd4..ddf0b055a92d80295b24255a5462d477e0d9c796 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -252,6 +252,11 @@ first_seq .. autoclass:: paddle.v2.layer.first_seq :noindex: +sub_seq +--------- +.. autoclass:: paddle.v2.layer.sub_seq + :noindex: + concat ------ .. autoclass:: paddle.v2.layer.concat diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 939731c0f3438a702e947ba1a7abeb5e3e6a8f53..004ee2d8c85ce7661886179570e693d7d61bc6d8 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -68,12 +68,6 @@ scale :noindex: -reshape ---------- -.. autofunction:: paddle.v2.fluid.layers.reshape - :noindex: - - transpose --------- .. autofunction:: paddle.v2.fluid.layers.transpose diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md index 202b4b65103c0b7c536a9cb466c4120ce134d8c3..691081c268b848811bf5ee6d6a41edfe0f47eec0 100644 --- a/doc/design/optimizer.md +++ b/doc/design/optimizer.md @@ -79,7 +79,7 @@ class Optimizer(object): def minimize(self, loss, parameter_list): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. 
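
        A rough usage sketch (`opt`, `loss` and `params` are placeholder
        names for illustration; a concrete `Optimizer` subclass would be
        used in practice):

            opt_ops = opt.minimize(loss, parameter_list=params)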
""" params_grads = self.create_backward_pass(loss, parameter_list) diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index fa1b6a372728ccac128d2e6e79a6514b8884ea3f..bae42593ddc6f7a7eb47d603752ad6efa9820b45 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -15,7 +15,7 @@ 获取PaddlePaddle的Docker镜像 ------------------------------ -执行下面的命令获取最新的PaddlePaddle Docker镜像 +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: .. code-block:: bash @@ -27,7 +27,7 @@ docker pull docker.paddlepaddle.org/paddle -下载GPU版本的Docker镜像: +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: .. code-block:: bash @@ -54,7 +54,7 @@ .. _docker_run: 在Docker中执行PaddlePaddle训练程序 ------------------------------- +---------------------------------- 假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 `PaddlePaddleBook `_ @@ -82,7 +82,7 @@ .. _docker_run_book: 使用Docker启动PaddlePaddle Book教程 ------------------------------- +----------------------------------- 使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 06012bf65e75c32957516f6b7f62e09480871b84..56a7c68e4d39c45249fa55a964dc48b7081596a6 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps. Pull PaddlePaddle Docker Image ------------------------------ -Run the following command to download the latest Docker images: +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: .. code-block:: bash @@ -28,7 +28,7 @@ For users in China, we provide a faster mirror: docker pull docker.paddlepaddle.org/paddle -Download GPU version images: +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: .. code-block:: bash @@ -58,7 +58,7 @@ and run: .. _docker_run: Launch your training program in Docker ------------------------------- +-------------------------------------- Assume that you have already written a PaddlePaddle program named :code:`train.py` under directory :code:`/home/work` (refer to diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index a4587f82a984acf243f49834e707fcd66d5b1252..0c741e936b46eda5e7165e4ee54b545b14a28a19 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具 ------------------------------ -执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas。 .. code-block:: bash pip install paddlepaddle -如果需要安装支持GPU的版本,需要执行: +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: .. 
code-block:: bash diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 55e31560a0f5087ab69966a6281c6c8573c04204..285ed09805b09790beaef014f6813c227aff33ac 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -12,14 +12,14 @@ Install Using pip ------------------------------ Run the following command to install PaddlePaddle on the current -machine, it will also download requirements. +machine, it will also download requirements, the version is cpu_avx_openblas. .. code-block:: bash pip install paddlepaddle -If you wish to install GPU version, just run: +If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run: .. code-block:: bash diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index a9087be6f350c5656cabb0c64ba0f200d1c666cc..9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -7,13 +7,13 @@ ++++++++ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 -执行下面的命令完成快速安装: +执行下面的命令完成快速安装,版本为cpu_avx_openblas: .. code-block:: bash pip install paddlepaddle -如果需要安装支持GPU的版本,需要执行: +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: .. code-block:: bash diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index d14e3f5c0cc90792fce9cb82e65da482c44dc433..063d9d880c82550f7f5d47d3d0b1fff59865bca7 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -8,13 +8,13 @@ Quick Install You can use pip to install PaddlePaddle with a single command, supports CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. -Simply run the following command to install: +Simply run the following command to install, the version is cpu_avx_openblas: .. code-block:: bash pip install paddlepaddle -If you need to install GPU version, run: +If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: .. code-block:: bash diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 738684795d8170ffd5c5b2bf19e6e150219332d0..6788cb34fbaf5941cbb1537c7a83577c623bf76a 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) +if (WITH_GPU) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) +else() + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) +endif () cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) -cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +if (WITH_GPU) + nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) +else() + cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 35f16025a9ae44bd70e15b19b25deb08299bea88..376268888e70b0a70060c81384f79f8bf5d6dcc5 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/data_transform.h" +#include "paddle/framework/lod_tensor.h" namespace paddle { namespace framework { diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index 73f894a3e20ab779f8607e63a67139b0e8cce79a..bd6d301c12e0611c5b01c3ff58869dbeb96b268e 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -27,9 +27,8 @@ limitations under the License. */ namespace paddle { namespace framework { -using DataTransformFN = - std::function ctx, - const Variable& in, Variable* out)>; +using DataTransformFn = std::function; using KernelTypePair = std::pair; struct KernelTypePairHash { @@ -47,7 +46,7 @@ struct KernelTypePairHash { }; using DataTransformMap = - std::unordered_map; + std::unordered_map; class DataTransformFnMap { public: @@ -58,25 +57,25 @@ class DataTransformFnMap { } void Insert(const OpKernelType& left, const OpKernelType& right, - const DataTransformFN& data_tranform_fn) { + const DataTransformFn& data_tranform_fn) { Insert(std::make_pair(left, right), data_tranform_fn); } void Insert(const KernelTypePair& kernel_type_pair, - const DataTransformFN& data_tranform_fn) { + const DataTransformFn& data_tranform_fn) { PADDLE_ENFORCE(!Has(kernel_type_pair), "KernelTypePair %s has been registered", ""); map_.insert({kernel_type_pair, data_tranform_fn}); } - const DataTransformFN& Get(const KernelTypePair& key_pair) const { + const DataTransformFn& Get(const KernelTypePair& key_pair) const { auto data_transformer = GetNullable(key_pair); PADDLE_ENFORCE_NOT_NULL(data_transformer, - "DataTransformFN should not be NULL"); + "DataTransformFn should not be NULL"); return *data_transformer; } - const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const { + const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const { auto it = map_.find(key_pair); if (it == map_.end()) { return nullptr; diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index f93a47eeb567c4fc984954aa5198362c9939c556..5f05e881fa16eead1dc690f85375706bf3cd3e6d 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -11,36 +11,61 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include -#include "paddle/framework/data_transform.h" #include +#include "paddle/framework/data_transform.h" + namespace paddle { namespace framework { - using namespace platform; +/** + * @brief cross validation of different kernel type transform + * We use four bit map represent different combination. + * If the field has multiple possible value, only choose two of them. + * For DataType, only test the FP32(float), FP64(double). + * e.g. 
0000 -> FP32, CPUPlace, kNHWC, kPlain + * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN + */ + +std::array kDataType = { + {proto::DataType::FP32, proto::DataType::FP64}}; + +std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; + +std::array kDataLayout = { + {DataLayout::kNHWC, DataLayout::kNCHW}}; + +std::array kLibraryType = { + {LibraryType::kPlain, LibraryType::kMKLDNN}}; + +OpKernelType GenFromBit(const std::vector bits) { + return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], + kLibraryType[bits[3]]); +} + int test_value = 0; -OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW, - LibraryType::kCUDNN); -OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0), - DataLayout::kNCHW, LibraryType::kCUDNN); -OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0), - DataLayout::kNCHW, LibraryType::kCUDNN); +auto kernel0 = GenFromBit({0, 0, 0, 0}); +auto kernel1 = GenFromBit({0, 0, 0, 1}); +auto kernel2 = GenFromBit({0, 0, 1, 0}); +auto kernel3 = GenFromBit({0, 0, 1, 1}); -void type1_to_type2(std::vector ctx, - const Variable& in, Variable* out) { +void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value++; } -void type2_to_type3(std::vector ctx, - const Variable& in, Variable* out) { +void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value--; } -void type1_to_type3(std::vector ctx, - const Variable& in, Variable* out) { +void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value += 2; } @@ -49,12 +74,9 @@ void type1_to_type3(std::vector ctx, namespace frw = paddle::framework; -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2, - frw::type1_to_type2); -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3, - frw::type2_to_type3); -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3, - frw::type1_to_type3); +REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t); +REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t); +REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t); TEST(DataTransform, Register) { using namespace paddle::framework; @@ -62,17 +84,16 @@ TEST(DataTransform, Register) { auto& instance = DataTransformFnMap::Instance(); ASSERT_EQ(instance.Map().size(), 3UL); - std::vector ctx; + DeviceContext* ctx = nullptr; paddle::framework::Variable in; paddle::framework::Variable out; - instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in, - &out); + instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out); ASSERT_EQ(test_value, 1); - instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in, - &out); + + instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out); ASSERT_EQ(test_value, 0); - instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in, - &out); + + instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out); ASSERT_EQ(test_value, 2); } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 997773c1689efad4ce5a86c09ce58bd3a40185e0..bf1f0471ccbfccf13cb6f74c8088da7acd68ec0b 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,18 +14,17 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" -#include -#include -#include #include -#include +#include "gflags/gflags.h" #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/lod_rank_table.h" -#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" -#include "paddle/framework/scope.h" + +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); namespace paddle { namespace framework { @@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { } } +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): @@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); VLOG(3) << op->DebugString(); op->Run(*local_scope, place_); + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } } - if (create_local_scope) { + if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } } diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index d6601090d5b6150a5aa467210038d3693c3e67a8..682cff168d4d31e0565fc987604f97a671566fbd 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -71,7 +71,7 @@ bool InitDevices(const std::vector &devices) { places.emplace_back(platform::CPUPlace()); LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } - platform::DeviceContextPool::Create(places); + platform::DeviceContextPool::Init(places); return true; } diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index f8a3be9a82bdbaf82550634d36122eb7bbe85e54..7b6dc09bdb5535488c8c4dbc71c9cd6a7998bd0b 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { - // TODO(typhoonzero): serialize to ostream - { // the 1st field, uint32_t version + { // the 1st field, uint32_t version for LoDTensor constexpr uint32_t version = 0; os.write(reinterpret_cast(&version), sizeof(version)); } - { // the 2nd field, tensor description - // int32_t size - // void* protobuf message - proto::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); - auto dims = framework::vectorize(tensor.dims()); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - os.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - os.write(out.data(), size); - } - { // the 3rd field, tensor data - uint64_t size = 
tensor.memory_size();
-    auto *data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto &gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void *>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char *>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-  {  // the 4th field, lod information
-     // uint64_t lod_level
-     // uint64_t lod_level_1 size in byte.
-     // int* lod_level_1 data
-     // ...
+  {
+    // the 2nd field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int* lod_level_1 data
+    // ...
     auto lod = tensor.lod();
     uint64_t size = lod.size();
     os.write(reinterpret_cast<const char *>(&size), sizeof(size));
@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                static_cast<std::streamsize>(size));
     }
   }
+  // the 3rd field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
 }
 
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
-  uint32_t version;
-  is.read(reinterpret_cast<char *>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char *>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char *>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-
-    void *buf;
-    platform::Place cpu = platform::CPUPlace();
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char *>(buf), tensor->memory_size());
-  }
-  {  // read lod
+  {
+    // the 1st field, uint32_t version for LoDTensor
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, LoD information
     uint64_t lod_level;
     is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
     auto &lod = *tensor->mutable_lod();
@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
       lod[i] = tmp;
     }
   }
+  // the 3rd field, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
 }
 
 }  // namespace framework
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 02d84b68233f2fdfc66e1df2fc7ce20307cadd94..0747c8db531d6ae443d76591b945cce0c9bbea2b 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -126,6 +126,20 @@
TEST_F(LoDTensorTester, ShrinkInLevel) { EXPECT_NE(t1.data(), lod_tensor_.data()); } +TEST_F(LoDTensorTester, SerializeAndDeserialize) { + LoDTensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + std::ostringstream oss; + SerializeToStream(oss, lod_tensor_, cpu_ctx); + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + float* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < kLodTensorSize; ++i) { + EXPECT_EQ(dst_ptr[i], i); + } + EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod()); +} + TEST(LodExpand, test) { LoD lod{{0, 2}}; LoDTensor tensor; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index b361e64438251c1df827667fb825e7f5909fb09e..781bbb4c19f1c610df485c3061ca8b510e727019 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) : desc_(desc), need_update_(false) { // restore inputs_ diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index 93d4a88f3c390551ab41e42ec2f6f30f52e306db..4cf784a0d0d319d09caa27b4e2b589bd7ac4f324 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -35,6 +35,8 @@ class OpDesc { OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); + void CopyFrom(const OpDesc &op_desc); + proto::OpDesc *Proto(); std::string Type() const { return desc_.type(); } diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index 97b542e345feab0bab701dd967558ce23375dc7f..b06002096fb109da806809f7b908d9768cf095ba 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -68,6 +68,8 @@ struct OpKernelType { data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && library_type_ == o.library_type_; } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } }; inline std::ostream& operator<<(std::ostream& os, @@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os, return os; } +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc index dd048405007974667bbb8a052b77ab8b3aa4580e..649afeee8a846b0579545f2edff77e9dbe3b4dd8 100644 --- a/paddle/framework/op_kernel_type_test.cc +++ b/paddle/framework/op_kernel_type_test.cc @@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, LibraryType::kCUDNN); - std::ostringstream stream; - stream << op_kernel_type; ASSERT_EQ( - stream.str(), + paddle::framework::KernelTypeToString(op_kernel_type), "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 886f73e7b81c35cac573bd041e6462eb2111bf85..a3ce96c409675ad52a811586c736ca22b5c7e99e 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext { const Scope& scope_; }; +const 
platform::DeviceContext* GetDeviceContext( + framework::KernelTypePair& kernel_pair) { + auto& actual_kernel_key = kernel_pair.first; + auto& expected_kernel_key = kernel_pair.second; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(actual_kernel_key.place_) && + platform::is_cpu_place(expected_kernel_key.place_)) { + return pool.Get(actual_kernel_key.place_); + } else if (platform::is_cpu_place(actual_kernel_key.place_) && + platform::is_gpu_place(expected_kernel_key.place_)) { + return pool.Get(expected_kernel_key.place_); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto dev_ctx = pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); @@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope, } if (actual_kernel_key == expected_kernel_key) { - kernel_iter->second->Compute(ctx); + PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_, + "Currently, model parallelism is only supported between " + "CPU and other devices. For example, multi-GPU model " + "parallelism will failed."); } else { - Scope& op_scope = scope.NewScope(); - auto input_vars = this->InputVars(); - for (auto var_name : input_vars) { - op_scope.Var(var_name); - } - - // TODO(qijun) get appropriate DeviceContext from DeviceContext pool - platform::DeviceContext* trans_dev_ctx = nullptr; - std::vector trans_dev_ctx_vec{trans_dev_ctx}; + auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key); + const DataTransformFn* trans_fun = + DataTransformFnMap::Instance().GetNullable(kernel_pair); + if (trans_fun) { + auto input_vars = this->InputVars(); + // TODO(qijun) filter the input vars that do not need to be transformed + + // filter vars that has been transformed + std::vector need_trans; + for (auto var_name : input_vars) { + auto var_name_trans = + var_name + framework::KernelTypeToString(expected_kernel_key); + if (!scope.FindVar(var_name_trans)) { + const_cast(scope).Var(var_name_trans); + need_trans.push_back(var_name); + } + } - // TODO(qijun) get appropriate DataTransformFN from global map - framework::DataTransformFN trans_fun = nullptr; + if (!need_trans.empty()) { + auto trans_dev_ctx = GetDeviceContext(kernel_pair); - // Wait for transform starting - dev_ctx->Wait(); + // Wait for transform starting + dev_ctx->Wait(); - for (auto var_name : input_vars) { - trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)), - op_scope.FindVar(var_name)); - } - // Wait for data transform finishing - for (auto ctx : trans_dev_ctx_vec) { - ctx->Wait(); + for (auto var_name : need_trans) { + (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)), + scope.FindVar(var_name + framework::KernelTypeToString( + expected_kernel_key))); + } + // Wait for data transform finishing + trans_dev_ctx->Wait(); + } } - - // Create a new ExecutionContext - ExecutionContext op_ctx(*this, op_scope, *dev_ctx); - kernel_iter->second->Compute(op_ctx); } + + kernel_iter->second->Compute(ctx); } OpKernelType OperatorWithKernel::GetActualKernelType( diff --git 
a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc
index c74459c9dd7006a24615b1d6df041583088fb25c..82adfa7123a3cf40d929021602c45fe7d2e34ffa 100644
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
@@ -12,5 +12,58 @@ limitations under the License. */
 #include "paddle/framework/selected_rows.h"
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  SerializeToStream(os, selected_rows.value(), dev_ctx);
+}
+
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
+  {
+    // the 1st field, uint32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3rd field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, the tensor which contains the data. Deserialize directly
+  // into the value tensor held by selected_rows; a local Tensor copy would
+  // keep the newly allocated buffer to itself.
+  DeserializeFromStream(is, selected_rows->mutable_value());
+}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h
index 0332b91323e3a4b4b80e02302ad3dcafe0986cde..699e392688e9889f050592172f8bfc45f855d0b1 100644
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
@@ -59,5 +59,14 @@ class SelectedRows {
   int64_t height_;
 };
 
+/*
+ * Serialize/Deserialize SelectedRows to std::ostream
+ * You can pass an ofstream or ostringstream to serialize to a file
+ * or to an in-memory string. GPU tensor will be copied to CPU.
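+ *
+ * As written by SerializeToStream above, the stream layout is, in order:
+ *   uint32_t version; uint64_t row count followed by the int64_t row
+ *   indices; int64_t height; then the value Tensor in the format of
+ *   tensor_util.h. DeserializeFromStream reads the same fields back in
+ *   the same order.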
+ */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc index 4ee13a65d72e44693573397bb686b355effb2227..75487c4010391aa9e519d73058184fa936dabb84 100644 --- a/paddle/framework/selected_rows_test.cc +++ b/paddle/framework/selected_rows_test.cc @@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) { ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); } +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index b9f6884f7c4b6eeacda722762d485dea97fdcb63..341a6949beeb2dfa64b23d2079bd8f48750a94f8 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -178,7 +178,7 @@ class Tensor { DDim dims_; /** - * @brief the layout of memory block, default is NCHW. + * @brief the layout of memory block, default is NHWC. * * @note the memory allocation order, describe how weight/data is stored * For example, in 4-D Tensor(rank=4), there are three commonly diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index ca76a9fcb9079bab22f7b192c45903852c91797f..a1b4a03289eca4c8b9d8c23ede4221853cb31f79 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -15,12 +15,13 @@ #include #include +namespace framework = paddle::framework; +namespace platform = paddle::platform; + TEST(Tensor, Dims) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor tt; + framework::Tensor tt; tt.Resize({2, 3, 4}); - DDim dims = tt.dims(); + framework::DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { EXPECT_EQ(i + 2, dims[i]); @@ -28,12 +29,12 @@ TEST(Tensor, Dims) { } TEST(Tensor, DataAssert) { - paddle::framework::Tensor src_tensor; + framework::Tensor src_tensor; bool caught = false; try { src_tensor.data(); - } catch (paddle::platform::EnforceNotMet err) { + } catch (platform::EnforceNotMet err) { caught = true; std::string msg = "holder_ should not be null\nTensor holds no memory. Call " @@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) { because Memory::Alloc() and Memory::Free() have not been ready. 
*/ TEST(Tensor, MutableData) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; + framework::Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), CPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), CPUPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); EXPECT_EQ(p1, p2); } #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; + framework::Tensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CUDAPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), CUDAPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CUDAPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), CUDAPlace()); + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), CUDAPlace()); + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); EXPECT_EQ(p1, p2); } #endif } TEST(Tensor, ShareDataWith) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; - Tensor dst_tensor; + framework::Tensor src_tensor; + framework::Tensor dst_tensor; // Try to share data form uninitialized tensor bool caught = false; try { @@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) { } ASSERT_TRUE(caught); - src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; - Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), CUDAPlace()); + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) { } TEST(Tensor, Slice) { - using namespace paddle::framework; - using namespace paddle::platform; { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = 
src_tensor.Slice(1, 3); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); EXPECT_EQ(slice_dims[1], 3); @@ -153,11 +159,12 @@ TEST(Tensor, Slice) { uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), CPUPlace())); + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), CPUPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); @@ -165,22 +172,25 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), CUDAPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); EXPECT_EQ(slice_dims[1], 9); uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), CUDAPlace())); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), CUDAPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); @@ -189,23 +199,19 @@ TEST(Tensor, Slice) { } TEST(Tensor, ReshapeToMatrix) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor src; - int* src_ptr = src.mutable_data({2, 3, 4, 9}, CPUPlace()); + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - Tensor res = ReshapeToMatrix(src, 2); + framework::Tensor res = framework::ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } TEST(Tensor, Layout) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor src; - ASSERT_EQ(src.layout(), DataLayout::kNHWC); - src.set_layout(DataLayout::kAnyLayout); - ASSERT_EQ(src.layout(), DataLayout::kAnyLayout); + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), 
framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7efc649d0bcda67c663d148e83bcbb6789b0f371
--- /dev/null
+++ b/paddle/framework/tensor_util.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_util.h"
+
+namespace paddle {
+namespace framework {
+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    // o is true if any element of predicate_(t) is true.
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    CopyFrom(out, cpu, *gpuctx, &tmp);
+    gpuctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();
+  }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+struct HasNANPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isnan()) {
+    // Cast eigen_vector to vector of bool. true if is nan.
+    return eigen_vec.isnan();
+  }
+};
+
+bool HasNAN(const framework::Tensor& tensor) {
+  HasNANPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isinf()) {
+    // Cast eigen_vector to vector of bool. true if is inf.
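+    // The boolean expression returned here is consumed by AnyDTypeVisitor
+    // above, which reduces it with predicate_(t).any() on the tensor's own
+    // device; for GPU tensors only the final bool is copied to the host.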
+ return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu new file mode 120000 index 0000000000000000000000000000000000000000..b00e6e59d93328bf3142597ea4de0dc225501e56 --- /dev/null +++ b/paddle/framework/tensor_util.cu @@ -0,0 +1 @@ +./tensor_util.cc \ No newline at end of file diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index ea4e4f22ea82ccc9f8b683d2fd773a7bc37f78a3..6a21f8db1e3966fd23eee0da2346b2d61f9321fb 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/data_type.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/framework.pb.h" #include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { @@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +// Returns true if a tensor contains NAN, i.e., Not A Number. +bool HasNAN(const framework::Tensor& tensor); + +// Returns true if a tensor contains Inf, i.e., Infinity. +bool HasInf(const framework::Tensor& tensor); + +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), 
sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + + void* buf; + platform::Place cpu = platform::CPUPlace(); + // TODO(Yancey1989): use VisiterDataType instead of DataType switch + switch (desc.data_type()) { + case proto::FP32: + buf = tensor->mutable_data(cpu); + break; + case proto::FP64: + buf = tensor->mutable_data(cpu); + break; + case proto::INT32: + buf = tensor->mutable_data(cpu); + break; + case proto::INT64: + buf = tensor->mutable_data(cpu); + break; + default: + PADDLE_THROW("DataType %d not supported", desc.data_type()); + } + is.read(static_cast(buf), tensor->memory_size()); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index f388c19f28ed28335818733f946d8eaf18464627..0dc5166fcabf77b48b8681ab1f050e2bc88f44ab 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -13,6 +13,7 @@ #include "paddle/framework/tensor_util.h" #include +#include #include namespace paddle { @@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) { #endif } +TEST(HasNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(HasInf, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + + delete gpu_place; + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_util_test.cu b/paddle/framework/tensor_util_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..ebd35fdf6c2a1388fec23057070f723c8ef9da9c --- /dev/null +++ b/paddle/framework/tensor_util_test.cu @@ -0,0 +1,57 @@ +/* Copyright 
(c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/framework/tensor_util.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +static __global__ void FillNAN(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = NAN; +} +static __global__ void FillInf(float* buf) { + buf[0] = 0.0; + buf[1] = INFINITY; + buf[2] = 0.5; +} + +TEST(HasNAN, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasNAN(tensor)); +} + +TEST(HasInf, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasInf(tensor)); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index 5f6b2d458f7ee764c22d203f285b78023b6012f3..bcd8190755083ec30687675602a1c95a9c15c69e 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include #include @@ -25,10 +26,11 @@ limitations under the License. */ namespace paddle { namespace framework { -typedef std::function Task; - class ThreadPool { public: + typedef std::packaged_task Task; + typedef std::function Fun; + /** * @brief Get a instance of threadpool, the thread number will * be specified as the number of hardware thread contexts @@ -61,13 +63,18 @@ class ThreadPool { /** * @brief Push a function to the queue, and will be scheduled and * executed if a thread is available. - * @param[in] Task will be pushed to the task queue. + * @param[in] Task, will be pushed to the task queue. + * @return std::future, we could wait for the task finished by + * f.wait(). */ - void Run(const Task& fn) { + std::future Run(const Fun& fn) { std::unique_lock lock(mutex_); - tasks_.push(fn); + Task task(std::bind(fn)); + std::future f = task.get_future(); + tasks_.push(std::move(task)); lock.unlock(); scheduled_.notify_one(); + return f; } /** @@ -110,7 +117,7 @@ class ThreadPool { break; } // pop a task from the task queue - auto task = tasks_.front(); + auto task = std::move(tasks_.front()); tasks_.pop(); --available_; diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc index 012d92a5edc415f0bb2f8a0ea38ffeb9549d54fa..50b6238cd8786be9d8cf2d5f821daadea12bd208 100644 --- a/paddle/framework/threadpool_test.cc +++ b/paddle/framework/threadpool_test.cc @@ -20,16 +20,21 @@ limitations under the License. 
*/
 
 namespace framework = paddle::framework;
 
 void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  std::vector<std::future<void>> fs;
   for (int i = 0; i < cnt; ++i) {
-    pool->Run([&sum]() { sum.fetch_add(1); });
+    auto f = pool->Run([&sum]() { sum.fetch_add(1); });
+    fs.push_back(std::move(f));
+  }
+  for (auto& f : fs) {
+    f.wait();
   }
 }
 
 TEST(ThreadPool, ConcurrentInit) {
   framework::ThreadPool* pool;
-  int concurrent_cnt = 50;
+  int n = 50;
   std::vector<std::thread> threads;
-  for (int i = 0; i < concurrent_cnt; ++i) {
+  for (int i = 0; i < n; ++i) {
     std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
     threads.push_back(std::move(t));
   }
@@ -38,13 +43,13 @@
   }
 }
 
-TEST(ThreadPool, ConcurrentStart) {
+TEST(ThreadPool, ConcurrentRun) {
   framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
   std::atomic<int> sum(0);
   std::vector<std::thread> threads;
-  int concurrent_cnt = 50;
+  int n = 50;
   // sum = (n * (n + 1)) / 2
-  for (int i = 1; i <= concurrent_cnt; ++i) {
+  for (int i = 1; i <= n; ++i) {
     std::thread t(do_sum, pool, std::ref(sum), i);
     threads.push_back(std::move(t));
   }
@@ -52,5 +57,5 @@
     t.join();
   }
   pool->Wait();
-  EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
+  EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index bd8973eeb369aabd2c52d4fccf799657c564ee78..7d002b9ea0b597730685ee03b021c4982f787f49 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW("The type of var '%s' is unsupported.", this->Name());
   }
 }
 
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index de7b70e271b38ebe3a4c38704d0cced47d010788..cbdbf5335d32d55a0221728758025c9d2cb3e7d1 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,14 +126,165 @@ public:
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
     }
+  }
+};
+
 #ifdef PADDLE_MOBILE_INFERENCE
-    if (Device == DEVICE_TYPE_CPU) {
-      memory_.reset();
+
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
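+    // The mobile-optimized path below processes the im2col matrix in
+    // fixed-size tiles (see stepColHeight/stepColWidth), so only a small
+    // scratch buffer stays resident; im2col and gemm then run once per
+    // tile instead of once per group.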
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
     }
-#endif
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, max col matrix width 2048
+    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor<real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
+        }
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+
+    memory_.reset();
   }
 };
 
+#endif
+
 /*
  * \brief Backward input calculation of convolution.
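// Editor's illustration (not part of the patch): the tile walk performed by
// GemmConvMobileFunction::calc() above. The (colHeight x colWidth) im2col
// matrix is visited in blocks of at most 256 x 2048 elements, which bounds
// the scratch buffer at 256 * 2048 * sizeof(real) bytes regardless of the
// layer's size.
#include <algorithm>
#include <cstddef>

void VisitIm2ColTiles(size_t colHeight, size_t colWidth) {
  const size_t stepH = std::min(colHeight, static_cast<size_t>(256));
  const size_t stepW = std::min(colWidth, static_cast<size_t>(2048));
  for (size_t hs = 0; hs < colHeight; hs += stepH) {
    for (size_t ws = 0; ws < colWidth; ws += stepW) {
      // Per-tile GEMM sizes: K is the reduction depth, N the output columns.
      const size_t K = std::min(colHeight - hs, stepH);
      const size_t N = std::min(colWidth - ws, stepW);
      // im2col fills a K x N tile, and GEMM accumulates it into the output;
      // beta switches from 0 to 1 once the first row strip has been written.
      static_cast<void>(K);
      static_cast<void>(N);
    }
  }
}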
*/ @@ -348,7 +499,11 @@ public: } }; +#ifdef PADDLE_MOBILE_INFERENCE +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); +#else REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +#endif REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 0c37fc972484bfbede01d23652e384071bf883af..36a9bcf84e4b14965c83627821b71d1c7c0da1b2 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -98,4 +98,54 @@ public: int dilationWidth = 1); }; +template +class Im2ColMobileFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int dilationHeight, + int dilationWidth, + int colHeightStart, + int colHeightSize, + int colWidthStart, + int colWidthSize) { + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputWidth = colShape[4]; + + for (int colh = 0; colh < colHeightSize; colh++) { + int wOffset = (colHeightStart + colh) % filterWidth; + int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; + int c_im = (colHeightStart + colh) / filterWidth / filterHeight; + + for (int colw = 0; colw < colWidthSize; colw++) { + int h = (colWidthStart + colw) / outputWidth; + int w = (colWidthStart + colw) % outputWidth; + + int imRowIdx = h * strideHeight + hOffset * dilationHeight; + int imColIdx = w * strideWidth + wOffset * dilationWidth; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { + colData[colh * colWidthSize + colw] = static_cast(0); + } else { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + colData[colh * colWidthSize + colw] = + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } +}; + } // namespace paddle diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 1f085538d81904dbd5b5d6bcd014adaed22e37d7..3ba866dcdd845403d52f7a85adfef08cbb11c305 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } #endif +template +void TestIm2ColMobileFunctor() { + for (size_t channels : {32}) { + for (size_t inputHeight : {33, 100}) { + for (size_t inputWidth : {32, 96}) { + for (size_t filterHeight : {5}) { + for (size_t filterWidth : {7}) { + for (size_t stride : {2}) { + for (size_t padding : {1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + 
Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(height, width, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColMobileFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation, + 0, + height, + 0, + width); + + autotest::TensorCheckEqual(*output1, *output2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } + } // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp index 741984bb68d3881f6ac26eaca7790190ed6e572a..ac217f1363dbd0360645bbe07cd71a17cc931a79 100644 --- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp @@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap, } /* the size of inputs for norm-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1UL); + CHECK_EQ(config_.inputs_size(), 1); const NormConfig& conf = config_.inputs(0).norm_conf(); localSize_ = conf.size(); alpha_ = conf.scale(); diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index c6da04b5b409c6419e30154e7853f99bcc59cdc5..7f56737ca99906ad7a62ec2fb8c4a6c1f3d5a221 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. 
DO NOT EDIT!\n\n") function(op_library TARGET) @@ -48,6 +49,11 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -181,56 +187,26 @@ endfunction() add_subdirectory(math) add_subdirectory(nccl) -set(DEPS_OPS - cond_op - cross_entropy_op - recurrent_op - softmax_with_cross_entropy_op - softmax_op - sequence_softmax_op - sum_op - pool_op - maxout_op - unpool_op - pool_with_index_op - conv_op - conv_transpose_op - nccl_op - sequence_conv_op - sequence_pool_op - lod_rank_table_op - lod_tensor_to_array_op - array_to_lod_tensor_op - max_sequence_len_op - lstm_op - tensor_array_read_write_op - gru_op - adagrad_op - sgd_op - save_op - load_op - send_op - recv_op - cos_sim_op) +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() if(WITH_DISTRIBUTE) -add_subdirectory(detail) -op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) -set_source_files_properties( - send_op.cc - PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - -op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) -set_source_files_properties( - recv_op.cc - PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - -cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op) endif() -op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) +op_library(cond_op DEPS framework_proto tensor net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) @@ -243,22 +219,17 @@ op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) -op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) -op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) -op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) -op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table) -op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc) -if(WITH_GPU) -op_library(nccl_op DEPS nccl_common) -endif() +op_library(lod_rank_table_op DEPS lod_rank_table) +op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) 
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) -op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +op_library(recurrent_op DEPS executor) op_library(cos_sim_op DEPS cos_sim_functor) - # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) @@ -271,13 +242,12 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") - cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) if(WITH_GPU) - cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 060ffac8273724fad48753d9a7e2b0affbb6f25e..e0eef5d9f93d70930ee82d663de9610cc0176e33 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); size_t offset; if (platform::is_gpu_place(i_tensor.place())) { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index 0aa04c268bd65dc246341a2f335c362496050bdb..49366fee8df5a44a97b7b4e87cbf0b7c813a414a 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { } auto slice = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx, &slice); diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 0560040509026e84eb543331996a6846751b8506..7d77be3be1034bb38f6c92c181aa525214073eec 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase { out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } diff --git a/paddle/operators/beam_search_decode_op.cc 
b/paddle/operators/beam_search_decode_op.cc index 52c28e7f532f9751589176c8d37362620167cf63..72e05607b0b612807d552b4c45b58f9d9ce9c2af 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope& scope, const platform::Place& dev_place) const override { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto& dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); framework::ExecutionContext ctx(*this, scope, dev_ctx); diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index 455fbd8ca3f5083fac51776524daca6f6a029667..e333002bfd1ab40c62882f09cd207a12a0939648 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, void CondOp::Run(const Scope& scope, const platform::Place& place) const { // get device context from pool - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto& dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index ab52a41b539236f1691ce8bc02d31e336ee4ccbb..e65a5dce52c3c51d3d6bee1684c1e97230203d38 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector paddings = ctx->Attrs().Get>("paddings"); int groups = ctx->Attrs().Get("groups"); std::vector dilations = ctx->Attrs().Get>("dilations"); - int input_channels = in_dims[1]; - int output_channels = filter_dims[0]; PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "Conv intput should be 4-D or 5-D tensor."); @@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( paddings.size(), strides.size(), "Conv paddings dimension and Conv strides dimension should be the same."); + + int input_channels = in_dims[1]; PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); + + int output_channels = filter_dims[0]; PADDLE_ENFORCE_EQ( output_channels % groups, 0, "The number of output channels should be divided by groups."); diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index a9c5c7046f8642072f2fdef14e1800ce7b05915b..fe39cb481aa63fa401603d97778b337282511ab5 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape N x D, " - "where N is the batch size and D is the number of classes. " + "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," + " where N is the batch size and D is the number of classes. 
" "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); AddInput("Label", "(Tensor), the ground truth which is a 2-D tensor. When " "soft_label is set to false, Label is a Tensor with shape " "[N x 1]. When soft_label is set to true, Label is a " - "Tensor with shape [N x K]."); + "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " "[N x 1]. The cross entropy loss."); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index cecbb7226aedabc3bee63c518d0ab9da496fca11..48da52c3b68879a1da8550a5448090f9f1e715d3 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { auto *out_item = out_var->GetMutable(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index fa20a06540efef05d6a15669b1f8dfb8bd5927bc..387d1e0a747f71d85826b52d140c2838112227f6 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index fe0706c4a9da864025737584b72c02cca83c956b..dcd43a30c86b62d79f52ac640f14b295a062146c 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index 57b4ec69384a55cef6d6d8d0c0145caae837b3f6..084ba1db62de0a6bf6829f8e9f4c274fb777e879 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(tensor, place, dev_ctx, &out); } } diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 5425375c1ffa16c223ec1b05506ffb4b4394d6e1..08b972a233aab8596a5ce7f74ea903df3b8ef0f2 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { 
out_var_name); auto *tensor = out_var->GetMutable(); - framework::DeserializeFromStream(fin, tensor); + DeserializeFromStream(fin, tensor); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (platform::is_gpu_place(place)) { // copy CPU to GPU diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index ed99915bb7fc312cac955b50ffbc3237aee503b0..8d164b4abc54722a95a176dfe8ed341f8c5125d1 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x.Slice(static_cast(each_range.begin), static_cast(each_range.end)), diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 830ae53cbe95ce1056abdaa89325b031ccdead2a..7ebcfb9ab9f30e3b0f13d3646a59d008335b232d 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -9,9 +9,9 @@ if(WITH_GPU) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) + nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) @@ -24,9 +24,9 @@ else() cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) - cc_library(vol2col SRCS vol2col.cc DEPS device_context) + cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) - cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) + cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2b35e4532a9c9f72f473020d472244234af24248..d4f12f0a106e077ac31aa37f46857b74e1e99b59 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -302,8 +302,29 @@ void 
set_constant(const platform::DeviceContext& context,
 #endif
 }
 
+template <typename T>
+struct RowwiseAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+
+    auto in = framework::EigenMatrix<T>::From(input);
+    auto vec = framework::EigenVector<T>::Flatten(vector);
+    auto out = framework::EigenMatrix<T>::From(*output);
+
+    // Add the vector to every row of the input.
+    for (int64_t i = 0; i < in_dims[0]; ++i) {
+      out.chip(i, 0) = in.chip(i, 0) + vec;
+    }
+  }
+};
+
 template struct RowwiseAdd<platform::CPUDeviceContext, float>;
 template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 927838a0948d2df5701b8e9189f59cdd66396b52..d47a7f818ded61baf31e46ea3b8ae3101324111f 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -273,6 +273,35 @@ void set_constant_with_place(
       TensorSetConstantGPU(context, tensor, value));
 }
 
+template <typename T>
+__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
+                                 int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    // Use integer division to locate the row; computing it via a
+    // floating-point reciprocal (h = i * (1.0 / width)) can round to the
+    // wrong row for large indices.
+    int h = i / width;
+    int w = i - h * width;
+    c[i] = a[i] + b[w];
+  }
+}
+
+template <typename T>
+struct RowwiseAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    int blocks = 512;
+    int grids = (input.numel() + blocks - 1) / blocks;
+    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
+        input.data<T>(), vector.data<T>(), output->data<T>(),
+        static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
+  }
+};
+
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index ddd798dace17012b7d9a949567a90d48067e6b15..de591626df28e2bc3391b609f909612411398247 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T>::operator()(
   eigen_out.device(*dev) = eigen_in.shuffle(permute);
 }
 
-template <typename DeviceContext, typename T>
-void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
-                                              const framework::Tensor& input,
-                                              const framework::Tensor& vector,
-                                              framework::Tensor* output) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector.numel(), size);
-  PADDLE_ENFORCE_EQ(output->dims(), in_dims);
-
-  auto in = framework::EigenMatrix<T>::From(input);
-  auto vec = framework::EigenMatrix<T>::From(vector);
-  auto out = framework::EigenMatrix<T>::From(*output);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
-  out.device(*context.eigen_device()) =
-      in + vec.reshape(shape).broadcast(bcast);
-}
-
 template <typename DeviceContext, typename T>
 void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const framework::Tensor& input,
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
index 2287f347910e83c25d2155b80670f9d991c1e5b2..3f999e404f8afe6bded09c820509fa0f36d30bf6 100644
--- a/paddle/operators/merge_lod_tensor_op.cc
+++
b/paddle/operators/merge_lod_tensor_op.cc @@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &dev_place) const override { // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 34a6e1a58d4bb8b267f3b6b3a0ac689c17036c37..6546096069d4c3fbc4908a16c2dba2ac6d7e6421 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -305,7 +305,7 @@ int main(int argc, char **argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 71769e67c7032f2d808d1394883cbe93f826b2f6..056fa46949cd623845956521b068109085a8795e 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { false /*create_local_scope*/); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output @@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { auto *program = block->Program(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 1063388e2539d47ca6ab56cd5fb0b946b6fb9147..8d652ff806461cea3d0e8d3bd70704b4b6bc2173 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { auto x_sliced = x.Slice(x_offset, x_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); out_offset += len; return out_offset; diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index d045a8b5b8d852278fd5140b9cf8707462b93c93..4b1cbe88836e340c94f797806243a6768410ed3d 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { auto &tensor = var->Get(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::SerializeToStream(fout, tensor, dev_ctx); } diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 6e829386832a875769669b02e6f41755618d348a..95c207221a7b34732eca4cfd07fed0a8f1671981 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Recv operator -This operator will recv tensor from send_op +This operator will send tensor to recv_op. 
)DOC"); AddAttr>("endpoints", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index e8a4773547861478b1771562ecd9e8b0bb3c3739..e5ef0740b6f385de7f17a3a419000cb8c897d986 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 89826ca6ee98d579f8b7c8795b6dc33cfa158ee1..2d8787d740c70f1d4696fdec381b572ecf031f57 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 9529aab573a73f55f7a67d39cde99aca023f473e..53e38ec70336ca7f2d7c142e5fb1bbe427ab2957 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); @@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index e450ef32a432a9dede05835341a166454a551889..ea07f2e002cb76d09a11f7a5305c2d45b780e7bd 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -17,7 +17,7 @@ namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Borrow( +const platform::DeviceContext* DeviceContextPool::Get( const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { @@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( return it->second; } -std::vector 
DeviceContextPool::Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto it = device_contexts_.find(place); - if (it != device_contexts_.end()) { - borrowed_contexts.emplace_back(it->second); - } else { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - } - return borrowed_contexts; -} - DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 8ba12e1657b6671b36fa9ad3f498820d61af585e..2b366e6383d23e2d31a194edd04412892a8311eb 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext { std::unique_ptr eigen_device_; }; +template +struct DefaultDeviceContextType; + +template <> +struct DefaultDeviceContextType { + using TYPE = CPUDeviceContext; +}; + #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; @@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext { cublasHandle_t cublas_handle_; }; +template <> +struct DefaultDeviceContextType { + using TYPE = CUDADeviceContext; +}; + class CUDNNDeviceContext : public CUDADeviceContext { public: explicit CUDNNDeviceContext(CUDAPlace place); @@ -109,13 +122,13 @@ class DeviceContextPool { public: explicit DeviceContextPool(const std::vector& places); - static DeviceContextPool& Get() { + static DeviceContextPool& Instance() { PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); return *pool; } /*! \brief Create should only called by Init function */ - static DeviceContextPool& Create(const std::vector& places) { + static DeviceContextPool& Init(const std::vector& places) { if (pool == nullptr) { pool = new DeviceContextPool(places); } @@ -123,13 +136,14 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Borrow(const platform::Place& place); - - /*! \brief Return handle of multi-device context. 
*/ - std::vector Borrow( - const std::vector& places); + const platform::DeviceContext* Get(const platform::Place& place); - ~DeviceContextPool() {} + template + const typename DefaultDeviceContextType::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType::TYPE*>(Get(place)); + } private: static DeviceContextPool* pool; diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index 91011bf71ccf4295075aa2ed73dffa92cb22d773..ca10cf34639376798bf5ba05970c9c734e5a1ef8 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CPUPlace; using paddle::platform::CUDAPlace; - DeviceContextPool& pool = DeviceContextPool::Get(); - auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); - auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); - EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(CUDAPlace(i)); - } - auto dev_ctxs = pool.Borrow(gpu_places); - for (size_t i = 0; i < dev_ctxs.size(); ++i) { - auto* dev_ctx = static_cast(dev_ctxs[i]); - - // check same as CUDAPlace(i) - CUDAPlace place = boost::get(dev_ctx->GetPlace()); - EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); } } int main(int argc, char** argv) { - int dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " - "device count is " - << dev_count; - return 0; - } - std::vector places; places.emplace_back(paddle::platform::CPUPlace()); @@ -109,7 +94,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h index 5427aa28238d6b46eb72d1fb49303dce3d871d7d..694a66d9ac4eb6ad02daf1931806fa1287de7cab 100644 --- a/paddle/platform/for_range.h +++ b/paddle/platform/for_range.h @@ -62,7 +62,7 @@ struct ForRange { template inline void operator()(Function func) const { - constexpr size_t num_threads = 1024; + constexpr int num_threads = 1024; int block_size = limit_ <= num_threads ? limit_ : num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads; diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 8f815863a72b118a0a258a2aa4276a33b95b0e46..ef6d845874745af1150e4425f8d6be416cc44ece 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -144,7 +144,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index d25eaa689f4a4baa951db5c61bbf99288e365ee1..76b5c502cc48431a4e9b13b07505978884576e1d 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -15,7 +15,7 @@ limitations under the License. 
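// Editor's illustration (not part of the patch): the renamed pool API in
// use. This sketch assumes DeviceContextPool::Init() has already been
// called at process start-up, as the updated tests above do.
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"

void PoolLookupExample() {
  namespace platform = paddle::platform;
  // Instance() replaces the old Get(); Get() replaces the old Borrow().
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* ctx = pool.Get(platform::CPUPlace());
  // GetByPlace() uses DefaultDeviceContextType to map CPUPlace to
  // CPUDeviceContext at compile time, avoiding a manual static_cast.
  const platform::CPUDeviceContext* cpu_ctx =
      pool.GetByPlace(platform::CPUPlace());
  static_cast<void>(ctx);
  static_cast<void>(cpu_ctx);
}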
*/ #pragma once #include - +#include "paddle/platform/enforce.h" #include "paddle/platform/variant.h" namespace paddle { @@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + } // namespace platform } // namespace paddle diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 6afed7eec7001b646d55cef0bc3f59782b80b15f..ced75cbfd899980390d41610d863d6cf154570b0 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -3,6 +3,7 @@ if(WITH_PYTHON) SRCS pybind.cc exception.cc protobuf.cc const_value.cc DEPS pybind python backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB}) + target_link_libraries(paddle_pybind rt) endif(WITH_PYTHON) if(WITH_DOC) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index f105370f226e2cceaac685f280d55134d4291028..07292d47e9c165c67fe4a30ee7d851c350beb2e0 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) { std::string name = byte_name; return self.HasVar(name); }) + .def("has_var_recursive", + [](BlockDesc &self, py::bytes byte_name) { + std::string name = byte_name; + return self.HasVarRecursive(name); + }) .def("find_var", [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.FindVar(name); }, py::return_value_policy::reference) + .def("find_var_recursive", + [](BlockDesc &self, py::bytes byte_name) { + std::string name = byte_name; + return self.FindVarRecursive(name); + }, + py::return_value_policy::reference) .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) .def("op_size", &BlockDesc::OpSize) .def("op", &BlockDesc::Op, py::return_value_policy::reference) @@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) { .def("set_shape", &VarDesc::SetShape) .def("set_dtype", &VarDesc::SetDataType) .def("shape", &VarDesc::Shape, py::return_value_policy::reference) - .def("dtype", &VarDesc::GetDataType) + .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) .def("lod_level", &VarDesc::GetLodLevel) .def("set_lod_level", &VarDesc::SetLoDLevel) .def("type", &VarDesc::GetType) @@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) { .value("BLOCK", proto::AttrType::BLOCK); py::class_ op_desc(m, "OpDesc", ""); - op_desc.def("type", &OpDesc::Type) + op_desc + .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); }, + py::return_value_policy::reference) + .def("copy_from", &OpDesc::CopyFrom) + .def("type", &OpDesc::Type) .def("set_type", &OpDesc::SetType) .def("input", &OpDesc::Input) .def("input_names", &OpDesc::InputNames) - .def("set_input", &OpDesc::SetInput) .def("output", &OpDesc::Output) .def("output_names", &OpDesc::OutputNames) + .def("set_input", 
&OpDesc::SetInput) .def("set_output", &OpDesc::SetOutput) + .def("input_arg_names", &OpDesc::InputArgumentNames) + .def("output_arg_names", &OpDesc::OutputArgumentNames) + .def("rename_input", &OpDesc::RenameInput) + .def("rename_output", &OpDesc::RenameOutput) .def("has_attr", &OpDesc::HasAttr) .def("attr_type", &OpDesc::GetAttrType) .def("attr_names", &OpDesc::AttrNames) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 07e38476e68b79f5b3192c619c89cd0e061cc686..04485ce7c1ab87f8655b0e6cbaecc36b3382f647 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle. } return ret_values; }); - m.def("get_grad_op_descs", - [](const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map &grad_to_var, - const std::vector &grad_sub_block) { - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, - grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); - std::transform( - grad_op_descs.begin(), grad_op_descs.end(), - grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); - return grad_op_desc_ptrs; - }); + m.def( + "get_grad_op_desc", [](const OpDesc &op_desc, + const std::unordered_set &no_grad_set, + const std::vector &grad_sub_block) { + std::unordered_map grad_to_var; + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, + grad_sub_block); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::transform(grad_op_descs.begin(), grad_op_descs.end(), + grad_op_desc_ptrs.begin(), + [](std::unique_ptr &p) { return p.release(); }); + return std::make_pair(grad_op_desc_ptrs, grad_to_var); + }); m.def("prune", [](const ProgramDesc &origin, const std::vector> &targets) { ProgramDesc prog_with_targets(origin); @@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle. 
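// Editor's illustration (hypothetical stand-in types, not part of the
// patch): the release-and-pair pattern used by the get_grad_op_desc binding
// above. pybind11 cannot copy a std::vector of std::unique_ptr into a Python
// list, so ownership is released to raw pointers before the descs are
// returned together with the grad-to-forward variable-name map.
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct OpDescStub {};  // stands in for framework::OpDesc in this sketch

std::pair<std::vector<OpDescStub*>,
          std::unordered_map<std::string, std::string>>
ReleaseAndPair(std::vector<std::unique_ptr<OpDescStub>> descs,
               std::unordered_map<std::string, std::string> grad_to_var) {
  std::vector<OpDescStub*> raw(descs.size());
  std::transform(descs.begin(), descs.end(), raw.begin(),
                 [](std::unique_ptr<OpDescStub>& p) { return p.release(); });
  return std::make_pair(std::move(raw), std::move(grad_to_var));
}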
InferenceOptimize(*(origin.Proto()), &pruned_desc); return new ProgramDesc(pruned_desc); }); + m.def("empty_var_name", []() { return framework::kEmptyVarName; }); + m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 67244d82602906231ac1fc870adccc7e82869407..4d5e73e2c266b301de4f19e09be7ab4009c936d3 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -63,9 +63,10 @@ struct CastToPyBufferImpl { auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); auto dev_ctx = static_cast( - pool.Borrow(tensor.place())); + pool.Get(tensor.place())); paddle::platform::GpuMemcpyAsync( dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), @@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto dev_ctx = - static_cast(pool.Borrow(place)); + static_cast(pool.Get(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice, dev_ctx->stream()); } diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index e43b9c218a3ecb9e7f20fb7e8b14a85a29947eef..92039ec6b05d224e702f0ba5dc05c057a492287e 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -178,7 +178,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip dmidecode && pip install -U pip && \ + apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index a94bc01b358c508132eb85920a2d4c0aa934dd51..8a352b0078d701f797f7202c85bd0e08201ac9b8 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -71,9 +71,7 @@ function threads_config() { # auto set OMP_NUM_THREADS and MKL_NUM_THREADS # according to trainer_count and total processors # only when MKL enabled - if [ "@WITH_MKL@" == "OFF" ]; then - return 0 - fi + # auto set OPENBLAS_NUM_THREADS when do not use MKL processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` if [ -z $trainers ]; then @@ -83,12 +81,19 @@ function threads_config() { if [ $threads -eq 0 ]; then threads=1 fi - if [ -z "$OMP_NUM_THREADS" ]; then - export OMP_NUM_THREADS=$threads - fi - if [ -z "$MKL_NUM_THREADS" ]; then - export MKL_NUM_THREADS=$threads + if [ "@WITH_MKL@" == "ON" ]; then + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi + else + if [ -z "$OPENBLAS_NUM_THREADS" ]; then + export OPENBLAS_NUM_THREADS=$threads + fi fi + } PADDLE_CONF_HOME="$HOME/.config/paddle" @@ -150,7 +155,7 @@ fi case "$1" in "train") threads_config $@ - # echo $OMP_NUM_THREADS $MKL_NUM_THREADS + # echo $OMP_NUM_THREADS $MKL_NUM_THREADS 
$OPENBLAS_NUM_THREADS
     ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
     ;;
   "merge_model")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 634388094c804827657dc83d5c205e680625b156..7bdddeaabec733ef26b3f766c6437f5c53d65044 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
 LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
 SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index c72b5730695dbc4f772015f1fb8dec6814cd1837..225b41c5043b5792abb90bbad53cbbfce9a3156e 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
     """
     import sys
     import core
-    read_env_flags = ['use_pinned_memory']
+    read_env_flags = ['use_pinned_memory', 'check_nan_inf']
     if core.is_compile_gpu():
         read_env_flags.append('fraction_of_gpu_memory_to_use')
     core.init_gflags([sys.argv[0]] +
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index f188582178f667125ec95cd230100fdb10ce7e88..6966cc75804b6b5a49ceb45a26994c23d2936bdb 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,17 +1,209 @@
 from paddle.v2.fluid import framework as framework
+from . import core
+import collections
 
-__all__ = ['append_backward_ops']
+__all__ = ['append_backward']
 
 
-def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
+def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None,
+                 end_idx=None):
+    if begin_idx is None:
+        begin_idx = 0
+    if end_idx is None:
+        end_idx = len(op_desc_list)
+    for i in range(begin_idx, end_idx):
+        op_desc = op_desc_list[i]
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc.rename_input(old_name, new_name)
+        op_desc.rename_output(old_name, new_name)
+
+
+def _create_op_desc_(op_type, inputs, outputs, attrs):
+    op_desc = core.OpDesc()
+    op_desc.set_type(op_type)
+    for para, args in inputs.iteritems():
+        op_desc.set_input(para, args)
+    for para, args in outputs.iteritems():
+        op_desc.set_output(para, args)
+    for name, val in attrs.iteritems():
+        if isinstance(val, framework.Block):
+            op_desc.set_block_attr(name, val.desc)
+        else:
+            op_desc.set_attr(name, val)
+    return op_desc
+
+
+def _infer_var_data_type_(var_name, block):
+    grad_var = block.desc.find_var(var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(var_name.encode("ascii"))
+    if block.desc.has_var_recursive(fwd_name):
+        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        grad_var.set_dtype(fwd_var.dtype())
+    else:
+        grad_var.set_dtype(core.DataType.FP32)
+
+
+def _all_in_set_(cands, s):
+    for c in cands:
+        if c not in s:
+            return False
+    return True
+
+
+def _strip_grad_suffix_(name):
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
+
+
+def _append_grad_suffix_(name):
+    return name + core.grad_var_suffix()
+
+
+def _addup_repetitive_outputs_(op_descs):
+    # In the backward pass, a variable may be the output of more than one op.
+    # In this case, the variable should be the accumulation of all the outputs.
+    # We add `sum_op`s to implement the accumulation.
+    pending_sum_ops = []
+    var_rename_count = collections.defaultdict(int)
+    renamed_vars = collections.defaultdict(list)
+    for idx, op_desc in enumerate(op_descs):
+        for var_name in op_desc.input_arg_names():
+            if len(renamed_vars[var_name]) > 1:
+                pending_sum_ops.append(
+                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                                      {"Out": [var_name]}, {}), idx))
+                renamed_vars[var_name] = [var_name]
+        for var_name in op_desc.output_arg_names():
+            if var_name == core.empty_var_name(
+            ) or var_name in op_desc.input_arg_names():
+                # empty variable or inplace op
+                continue
+            if len(renamed_vars[var_name]) == 0:
+                # it's the first time we see this variable
+                renamed_vars[var_name] = [var_name]
+            else:
+                if len(renamed_vars[var_name]) == 1:
+                    new_name = var_name + "@RENAME@" + \
+                        str(var_rename_count[var_name])
+                    var_rename_count[var_name] += 1
+                    # rename the original var_name
+                    renamed_vars[var_name][0] = new_name
+                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                new_name = var_name + "@RENAME@" + \
+                    str(var_rename_count[var_name])
+                var_rename_count[var_name] += 1
+                op_desc.rename_output(var_name, new_name)
+                renamed_vars[var_name].append(new_name)
+    for var_name, inputs in renamed_vars.iteritems():
+        if len(inputs) > 1:
+            pending_sum_ops.append((_create_op_desc_(
+                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+    # sum_op descs are sorted according to their insert position
+    for p in reversed(pending_sum_ops):
+        op_descs.insert(p[1], p[0])
+
+    return op_descs
+
+
+def _remove_no_grad_branch_(op_descs, no_grad_set):
+    # Remove ops whose outputs are all in no_grad_set
+    op_descs = filter(
+        lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
+        op_descs)
+    # Insert fill_zeros_like_op
+    to_insert = []
+    for idx, op_desc in enumerate(op_descs):
+        for arg in op_desc.input_arg_names():
+            if core.grad_var_suffix() in arg and arg in no_grad_set:
+                to_insert.append((_create_op_desc_("fill_zeros_like", {
+                    "X": [_strip_grad_suffix_(arg)]
+                }, {"Y": [arg]}, {}), idx))
+
+    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+
+    return op_descs
+
+
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var,
+                          callback=None):
+    grad_op_descs = []
+    program = block.program
+    for op in reversed(block.ops):
+        grad_sub_block_list = []
+        # If the op has its own sub-block, deal with the sub-block first
+        if op.has_attr("sub_block"):
+            sub_block = program.block(op.block_attr("sub_block"))
+            grad_sub_block = program.create_block(parent_idx=sub_block.idx)
+            _append_backward_ops_(target, sub_block, grad_sub_block,
+                                  no_grad_dict, grad_to_var, callback)
+            grad_sub_block_list.append(grad_sub_block.desc)
+
+        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+        grad_op_descs.extend(grad_op_desc)
+        grad_to_var.update(op_grad_to_var)
+
+    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+
+    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
+                                            no_grad_dict[block.idx])
+
+    if target_block.idx == 0:
+        grad_op_descs.insert(
+            0,
+            _create_op_desc_("fill_constant", {}, {
+                "Out": [_append_grad_suffix_(target.name)]
+            }, {"shape": [1],
+                "value": 1.0,
+                "dtype": target.dtype}))
+    # append each op_desc in grad_op_descs to target_block
+    for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+
+
+def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        if op_desc.has_attr("sub_block"):
+            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
+        new_vars = set()
+        # create new gradient variables
+        for grad_var_name in op_desc.output_arg_names():
+            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(
+                    grad_var_name) or grad_var_name == core.empty_var_name():
+                continue
+            block.desc.var(grad_var_name)
+            new_vars.add(grad_var_name)
+            if not grad_to_var.has_key(grad_var_name):
+                continue
+            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
+        # infer_shape and infer_type
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_(arg, block)
+
+
+def append_backward(loss, parameter_list=None, no_grad_set=None):
     """
     Create and add gradient Operators in BlockDesc to compute
     gradients of `loss` for parameters in parameter_list

     :param loss: an variable generated by cost function.
     :type loss: Variable
-    :param no_grad_set: variable that should not create gradient
-    :type no_grad_set: set
+    :param no_grad_set: variables whose gradients should not be computed
+    :type no_grad_set: set
     :param parameter_list: parameters that need to compute gradient and
                            update to optimize the lost.
     :type: list
@@ -20,35 +212,53 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
     """
     assert isinstance(loss, framework.Variable)

+    program = loss.block.program
+    no_grad_dict = dict()
     if no_grad_set is None:
-        program = loss.block.program
         assert isinstance(program, framework.Program)
-        no_grad_set = list()
         for block in program.blocks:
             assert isinstance(block, framework.Block)
+            block_no_grad_set = set()
             for var in block.vars.itervalues():
                 assert isinstance(var, framework.Variable)
                 if var.stop_gradient:
-                    no_grad_set.append(var.name)
-        no_grad_set = set(no_grad_set)
+                    block_no_grad_set.add(_append_grad_suffix_(var.name))
+            no_grad_dict[block.idx] = block_no_grad_set
+    elif isinstance(no_grad_set, set):
+        no_grad_dict = {0: no_grad_set}
+    else:
+        raise ValueError("'no_grad_set' should be a set or None.")
+
+    grad_info_map = dict()
+    root_block = program.block(0)
+
+    fwd_op_num = root_block.desc.op_size()
+    current_block_idx = program.current_block_idx
+    grad_to_var = dict()
+
+    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+                          grad_to_var)
+    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
+
+    program.current_block_idx = current_block_idx
+    program.sync_with_cpp()

-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
     if parameter_list is not None:
         parameters = parameter_list
     else:
-        params = loss.block.program.global_block().all_parameters()
+        params = program.global_block().all_parameters()
         parameters = [param.name for param in params]

     params_and_grads = []
     for param in parameters:
-        if param not in param_grad_map:
+        if param not in grad_info_map:
             raise ValueError("param %s is not in map" % param)
-        grad_info = param_grad_map[param]
-        grad_block = loss.block.program.block(grad_info[1])
+        grad_info = grad_info_map[param]
+        grad_block = grad_info[1]
         if not grad_block.has_var(grad_info[0]):
             raise ValueError("grad block[{0}] did not have grad var {1}".format(
                 grad_info[1], grad_info[0]))
         # Get the param var from the global block
-        param_var = loss.block.program.global_block().var(param)
+        param_var = program.global_block().var(param)
         grad_var = grad_block.var(grad_info[0])
         if loss.block.has_var(grad_info[0]):
             params_and_grads.append((param_var, grad_var))
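The public entry point keeps its signature across the rename, so existing callers only swap the name. A minimal usage sketch (a hypothetical toy network; `scale` and `sums` are used only to give `y` two consumers):

.. code-block:: python

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.backward import append_backward

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)
    # `y` feeds two consumers, so two gradient ops each emit a partial
    # gradient for it; _addup_repetitive_outputs_ renames the duplicates
    # (<name>@GRAD@RENAME@0, @RENAME@1, ...) and merges them with a sum op.
    doubled = fluid.layers.scale(x=y, scale=2.0)
    total = fluid.layers.sums(input=[y, doubled])
    loss = fluid.layers.mean(x=total)
    params_and_grads = append_backward(loss)  # [(param, param@GRAD), ...]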
diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py
index 30a542af212926c93381aade426e25f2117e4662..24036c3e75b9594ba58cccb02825ab8020d1e107 100644
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -3,7 +3,7 @@ import core
 import numpy
 import six.moves as six

-from framework import Variable
+from framework import Variable, default_main_program

 __all__ = ['DataFeeder']

@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):


 class DataFeeder(object):
-    def __init__(self, feed_list, place):
+    def __init__(self, feed_list, place, program=None):
         self.feed_dtypes = []
         self.feed_names = []
         self.feed_shapes = []
         self.feed_lod_level = []
+        if program is None:
+            program = default_main_program()
         for each_var in feed_list:
+            if isinstance(each_var, basestring):
+                each_var = program.block(0).var(each_var)
             if not isinstance(each_var, Variable):
                 raise TypeError("Feed list should contain a list of variable")
             self.feed_dtypes.append(each_var.dtype)
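With the new `program` argument and the `basestring` branch, a feeder can now be built from variable names as well as `Variable` objects. A sketch (hypothetical names; the variables must already exist in block 0 of the program being fed):

.. code-block:: python

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.data_feeder import DataFeeder

    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
    lbl = fluid.layers.data(name='label', shape=[1], dtype='int64')
    place = fluid.CPUPlace()

    # Strings are resolved via default_main_program().block(0).var(...),
    # so these two feeders are equivalent.
    feeder_by_var = DataFeeder(feed_list=[img, lbl], place=place)
    feeder_by_name = DataFeeder(feed_list=['image', 'label'], place=place)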
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index dbdf9a043cf0fb82921ea8f7449c8d9e4364d208..b66a8bce5f4f15539007876c113afd3f878b00bc 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -846,9 +846,11 @@ class Program(object):
         self.sync_with_cpp()
         return param_to_grad_info

-    def create_block(self):
+    def create_block(self, parent_idx=None):
         new_block_idx = len(self.blocks)
-        self.desc.append_block(self.current_block().desc)
+        parent = self.current_block() if parent_idx is None else self.block(
+            parent_idx)
+        self.desc.append_block(parent.desc)
         self.current_block_idx = new_block_idx
         self.blocks.append(Block(self, self.current_block_idx))
         return self.current_block()
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 69a732fc45a1946f260cdd9a9c2da150b87c3ddd..c47ce82aba7fa5ac42ac26cd25fa3ebc93e96cb2 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -188,7 +188,7 @@ def save_inference_model(dirname,
         raise ValueError("'feed_var_names' should be a list of str.")

     if isinstance(target_vars, Variable):
-        feeded_var_names = [feeded_var_names]
+        target_vars = [target_vars]
     else:
         if not (bool(target_vars) and all(
                 isinstance(var, Variable) for var in target_vars)):
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 2a462ee6cb47496b17a8f584e56ac2c8934b319a..55b35ad543b2350915574d7f9b4ef460156ee459 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -270,6 +270,7 @@ def gru_unit(input,
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
     # create bias
+
     if bias is None:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):

 def cross_entropy(input, label, **kwargs):
     """
-    This function computes cross_entropy using the input and label.
+    **Cross Entropy Layer**
+
+    This layer computes the cross entropy between `input` and `label`. It
+    supports both standard cross-entropy and soft-label cross-entropy loss
+    computation.
+
+    1) One-hot cross-entropy:
+        `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+
+        .. math::
+
+            Y[i] = -\log(X[i, Label[i]])
+
+    2) Soft-label cross-entropy:
+        `soft_label = True`, `Label[i, j]` indicates the soft label of class j
+        for sample i:
+
+        .. math::
+
+            Y[i] = \sum_j{-Label[i, j] * \log(X[i, j])}
+
+        Please make sure that in this case the summation of each row of `label`
+        equals one.
+
+    3) One-hot cross-entropy with vectorized `label`:
+        As a special case of 2), when each row of `label` has only one
+        non-zero element which is equal to 1, soft-label cross-entropy
+        degenerates to a one-hot cross-entropy with one-hot label
+        representation.
+
+    Args:
+        input (Variable|list): a 2-D tensor with shape [N x D], where N is the
+            batch size and D is the number of classes. This input is a
+            probability computed by the previous operator, which is almost
+            always the result of a softmax operator.
+        label (Variable|list): the ground truth which is a 2-D tensor. When
+            `soft_label` is set to `False`, `label` is a tensor with shape
+            [N x 1]. When `soft_label` is set to `True`, `label` is a
+            tensor with shape [N x D].
+        soft_label (bool, via `**kwargs`): a flag indicating whether to
+            interpret the given labels as soft labels, default `False`.
+
+    Returns:
+        A 2-D tensor with shape [N x 1], the cross entropy loss.
+
+    Raises:
+        `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
+            `soft_label == True`, and the 2nd dimension of `input` and `label` are not \
+            equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
+
+    Examples:
+        .. code-block:: python
+
+          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
     helper = LayerHelper('cross_entropy', **kwargs)
     out = helper.create_tmp_variable(dtype=input.dtype)
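A worked instance of case 1): for a softmax row X[i] = [0.2, 0.7, 0.1] and Label[i] = 1, the loss is -log(0.7), roughly 0.357. The soft-label path of case 2) differs only in how `label` is shaped and fed; a hedged sketch:

.. code-block:: python

    import paddle.v2.fluid as fluid

    images = fluid.layers.data(name='pixel', shape=[784], dtype='float32')
    # Soft labels: one distribution per sample, each row summing to one.
    label = fluid.layers.data(name='label', shape=[10], dtype='float32')
    predict = fluid.layers.fc(input=images, size=10, act='softmax')
    # soft_label travels through **kwargs, as documented above.
    cost = fluid.layers.cross_entropy(input=predict, label=label,
                                      soft_label=True)
    avg_cost = fluid.layers.mean(x=cost)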
@@ -514,14 +567,83 @@ def conv2d(input,
            groups=None,
            param_attr=None,
            bias_attr=None,
-           act=None,
-           name=None):
+           act=None):
     """
-    This function creates the op for a 2-dimensional Convolution.
-    This is performed using the parameters of filters(size, dimensionality etc)
-    , stride and other configurations for a Convolution operation.
-    This funciton can also append an activation on top of the
-    conv-2d output, if mentioned in the input parameters.
+    **Convolution2D Layer**
+
+    The convolution2D layer calculates the output based on the input, filter,
+    and the strides, paddings, dilations, and groups parameters. Input(Input)
+    and Output(Output) are in NCHW format, where N is the batch size, C is the
+    number of channels, H is the height of the feature, and W is the width of
+    the feature. For details of the convolution layer, please refer to UFLDL's
+    `convolution,
+    `_ .
+    If the bias attribute and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        Input:
+            Input shape: $(N, C_{in}, H_{in}, W_{in})$
+
+            Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+
+        Output:
+            Output shape: $(N, C_{out}, H_{out}, W_{out})$
+
+        Where
+
+        .. math::
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the number
+            of output image channels.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise,
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise,
+            padding_H = padding_W = padding. Default: padding = 0.
+        groups(int): The group number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act(str): Activation type. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
     if stride is None:
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index c56a531ed531cf0219e94854ba66c7399e003292..ff3e5315a2c2b115e4ba563f60de4139f248e93a 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,7 +1,7 @@
 from collections import defaultdict

 import framework
-from backward import append_backward_ops
+from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
@@ -194,10 +194,10 @@ class Optimizer(object):
                  no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.

-        This method combines interface `append_backward_ops()` and
+        This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set)

         params_grads = append_gradient_clip_ops(params_grads)
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index 8dbfbd547a6677517f028997e6269709aac43b67..b77d2b1268f27c5ec3c34839aaad9b75f0132c2e 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -4,7 +4,7 @@ import random
 import itertools
 import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.op import Operator
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.framework import Program, OpProtoHolder
@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
             op_loss.desc.infer_var_type(block.desc)
             op_loss.desc.infer_shape(block.desc)

-        param_grad_list = append_backward_ops(
+        param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)

         feed_dict = {
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
index f6120aedecf1015c279b8f218f5e37f2e598ab91..01321de8eac34d562d99726b1f4125d1932ab40f 100644
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy

@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
         total_sum = layers.sums(input=[a_sum, x_sum])
         total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

-        append_backward_ops(total_sum_scaled)
+        append_backward(total_sum_scaled)

         g_vars = map(default_main_program().global_block().var,
                      [each_x.name + "@GRAD" for each_x in x])
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
index 2b9d8f351a2836cd723d629d4790de1e068d0ea3..7d815123f3454d1457f59202219f9a93bf3d8c31 100644
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy

@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
         outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
         print outs
         loss = layers.mean(x=out)
-        append_backward_ops(loss=loss)
+        append_backward(loss=loss)
         outs = exe.run(
             feed={'X': x},
             fetch_list=[
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index 5fdabbcf889448114ac4e55e7944cb6c57ba5f3c..c552cb033f1ec8f5843490083edee7b2762b5703 100644
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -4,7 +4,7 @@ import numpy
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward

 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
             mean = layers.mean(x=result)

-            append_backward_ops(mean)
+            append_backward(mean)

             tensor = core.LoDTensor()
             tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 29694be58bce0eb41b05439da35ef07a542ef12a..1eadb7d912629024ee21e30b0a5fa4910bb96e06 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -2,7 +2,7 @@ import unittest

 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward

 class TestOptimizer(unittest.TestCase):
@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adagrad_optimizer = self.MockAdagrad(
             learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adam_optimizer = self.MockAdam(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
         learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         learning_rate = 0.01
         decayed_adagrad_optimizer = self.MockDecayedAdagrad(
             learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
index e38c763ddbcc5c8410f41d062c05499333a3ee55..84f4e36fa7312fbcb96cc66ff26e234c3016df30 100644
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core

@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
     def test_backward(self):
         self.check_forward()

-        append_backward_ops(self.output)
+        append_backward(self.output)

         ana_grad = [np.array(x) for x in self.backward()]

diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
index 24baf55e90c98f39bab926e8c85a791eee5ed4a4..890c881a126a32344128652691c6cad45e02e82d 100644
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -3,7 +3,7 @@ import unittest
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.regularizer as regularizer
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward

 class TestL2DecayRegularizer(unittest.TestCase):
@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
             dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        params_grads = append_backward_ops(mean_out)
+        params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
index 8f5774835e02191a068e86ea56f3f877c464a391..7c136f6360ce73a7c532b5486e544796e6853bcb 100644
--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
         new_dat = fluid.layers.reorder_lod_tensor_by_rank(
             x=dat, rank_table=table)
         loss = fluid.layers.mean(x=new_dat)
-        fluid.backward.append_backward_ops(loss=loss)
+        fluid.backward.append_backward(loss=loss)

         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 9999165ed509aa40f31f26aa676f381561bd0016..d1bb20f37a3785f70bee072b9df282bba4012c16 100644
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -2,7 +2,7 @@ import unittest

 from paddle.v2.fluid.framework import Program
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy as np
 import paddle.v2.fluid.core as core

diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 86db4c64b493d94cc675ed4bcee7e2925fef1977..be1588fc2d09fa58882425eb3d080ef1560ebc79 100644
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 from paddle.v2.fluid.framework import default_main_program
 import numpy

@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
         self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))

         mem3_mean = layers.mean(x=mem3)
-        append_backward_ops(loss=mem3_mean)
+        append_backward(loss=mem3_mean)
         x_grad = exe.run(
             feed={'x': tensor},
             fetch_list=[main_program.global_block().var('x@GRAD')])[0]
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
index 8cdd59ff3cc7deb57252fc5218d239f86016cb9c..2e4defd55d75c2012f39bea30a6c4de12528e77c 100644
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -4,7 +4,7 @@ import numpy as np
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward

 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
             in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
         mean = layers.mean(x=out)

-        append_backward_ops(mean)
+        append_backward(mean)

         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index 033b03a4957131e1155c61e8ed2f10eefb23fda4..7c5593cc5e5a66d4ccb237e3706ff3e544adf033 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.backward import append_backward
 import numpy

@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
         sum_result = layers.array_read(array=mem_array, i=i)
         loss = layers.mean(x=sum_result)

-        append_backward_ops(loss)
+        append_backward(loss)

         cpu = core.CPUPlace()
         exe = Executor(cpu)
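End to end, the rename stays invisible to code that drives training through an optimizer, since `Optimizer.minimize()` now calls `append_backward()` internally. A minimal sketch, assuming the era's `SGDOptimizer` and `square_error_cost` APIs:

.. code-block:: python

    import paddle.v2.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(x=cost)

    sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.01)
    # No caller-side change beyond dropping any direct import of the old
    # append_backward_ops; minimize() routes through append_backward().
    sgd_optimizer.minimize(avg_cost)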