diff --git a/Dockerfile b/Dockerfile index 164fe84904947bfc3cf71132b5fba04744460b26..ea39efd00bb5c0a7deb3f6d57083d83a673b883c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,7 +70,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip install --upgrade pip==9.0.3 && \ +RUN easy_install -U pip && \ pip install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md index 8266dec3c6125a09b90ac0ccd4aa5464f5c7db31..198a05a79e19227e90eaafe116217a164cd51a7d 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_cn.md +++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md @@ -1,3 +1,5 @@ +# CPU性能调优 + 此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 @@ -8,7 +10,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大 * Python 与 C++ 混合代码的性能分析 -# Python代码的性能分析 +## Python代码的性能分析 ### 生成性能分析文件 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md index e95556dd608b7ff0a3eb18873df0015a2da94e7c..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -1,3 +1,5 @@ +# Tune CPU performance + This tutorial introduces techniques we use to profile and tune the CPU performance of PaddlePaddle. We will use Python packages `cProfile` and `yep`, and Google's `perftools`. @@ -14,7 +16,7 @@ the profiling and tuning of 1. the Python code and 1. the mixture of Python and C++ code. 
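(For context: the two CPU-profiling guides retitled above teach exactly this workflow. Below is a minimal sketch using only the Python standard library; `main()` is a placeholder workload, not part of this patch.)

```python
import cProfile
import pstats


def main():
    # Placeholder workload; in the guides this is a PaddlePaddle training loop.
    return sum(i * i for i in range(10 ** 6))


# Step 1: generate the performance profiling file the guides describe.
cProfile.run('main()', filename='profile.out')

# Step 2: find hotspots, sorted by cumulative time (top 10 entries).
stats = pstats.Stats('profile.out')
stats.sort_stats('cumulative').print_stats(10)
```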
-# Profiling the Python Code +## Profiling the Python Code ### Generate the Performance Profiling File diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index b3d882743785e8ee301b71b696230531d2b7ba58..aa1dc6ee2cc9a3a528e54ce2da07746158735f56 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -37,12 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具 :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" .. 
_pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index 1e409d86b9775094998f72f92954f4bbc1013ea1..a70821eb487be841060e6b5f7fc8b014634ac5ba 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -40,12 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star :header: "version", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" .. 
_pip_dependency: diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index c0523f3c795b103c0c27081ec5dc717f6a0f11e0..5a57ec20585c26dbcd4251464718fc819148a7a5 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -91,6 +91,12 @@ void TransDataType(const OpKernelType& kernel_type_for_var, case proto::VarType::BOOL: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; + case proto::VarType::INT16: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::UINT8: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; default: PADDLE_THROW("Not support type %d", src_type); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 45bad58145a1144dfabdd3e15b38d972d57b105e..6b0c0a6b9fb29e641449f0c21109611cccd4e5a9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -98,7 +98,7 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, return false; }; - if (op.Type() == "split") { + if (op.Type() == "split" || op.Type() == "split_byref") { return checker(op.OutputArgumentNames(), send_op->InputArgumentNames()); } else if (op.Type() == "concat") { return checker(op.InputArgumentNames(), send_op->OutputArgumentNames()); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index e33eb7f170ea04b998283b24d9210a9bb7538675..cc1589514aab3b973b4909159748bc4223cdce46 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -149,7 +149,7 @@ void TestInference(const std::string& dirname, state = paddle::platform::ProfilerState::kCPU; } else { #ifdef PADDLE_WITH_CUDA - state = paddle::platform::ProfilerState::kCUDA; + state = paddle::platform::ProfilerState::kAll; // The default device_id of paddle::platform::CUDAPlace is 0. // Users can get the device_id using: // int device_id = place.GetDeviceId(); @@ -172,7 +172,7 @@ void TestInference(const std::string& dirname, } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "load_program_profiler.txt"); + "load_program_profiler"); paddle::platform::ResetProfiler(); // 3. 
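(The `TransDataType` hunk above adds INT16 and UINT8 branches; the op is an elementwise cast that preserves shape. A NumPy analogue of the new `uint8` path, for illustration only — this is not the framework's code path.)

```python
import numpy as np

# What the new UINT8 branch of TransDataType computes, in NumPy terms:
# an elementwise cast that keeps the shape and changes only the dtype.
src = np.array([[0, 127, 255]], dtype=np.uint8)
dst = src.astype(np.float32)

assert dst.shape == src.shape
assert dst.dtype == np.float32
print(dst)  # [[   0.  127.  255.]]
```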
Get the feed_target_names and fetch_target_names @@ -236,8 +236,7 @@ void TestInference(const std::string& dirname, // Disable the profiler and print the timing information paddle::platform::DisableProfiler( - paddle::platform::EventSortingKey::kDefault, - "run_inference_profiler.txt"); + paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); paddle::platform::ResetProfiler(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ac1f3f44ae8703c3e0c792bd9a2e658f1341ec15..7fce138e3f47e0eb485afb4d5a665eb41f68e286 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -204,6 +204,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor) if(WITH_GPU) + set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 9b51db8a45186c2a90cf8b2eb7966d0aaea04028..46bc4f6f936929050276e8b3b93f1eddd62ac638 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -14,10 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_TESTING -#include "gtest/gtest.h" -#endif - #include #include #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index d3f3ad92442cafdd8d4cdc396d89721863d069c2..29b73951bbddd9bfd73c932d7801797590de5e8e 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,45 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/is_empty_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { -constexpr char kInput[] = "X"; -constexpr char kOutput[] = "Out"; - -class IsEmptyOp : public framework::OperatorBase { +class IsEmptyOp : public framework::OperatorWithKernel { public: - IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} + using framework::OperatorWithKernel::OperatorWithKernel; - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get input - auto *var = scope.FindVar(Input(kInput)); - PADDLE_ENFORCE_NOT_NULL(var); - auto &tensor = var->Get(); - // get output - auto *out = scope.FindVar(Output(kOutput)); - PADDLE_ENFORCE_NOT_NULL(out); - auto *out_tensor = out->GetMutable(); + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IsEmptyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IsEmptyOp should not be null."); + ctx->SetOutputDim("Out", {1}); + } - out_tensor->Resize({1}); - out_tensor->mutable_data(platform::CPUPlace())[0] = - framework::product(tensor.dims()) == 0; + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + return kt; } }; -class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { +class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput(kInput, "(Tensor) Tensor which is to be checked."); - AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddInput("X", "(LoDTensor) Tensor which is to be checked."); + AddOutput("Out", + "(LoDTensor) a boolean Tensor that indicates whether the tensor is empty."); AddComment(R"DOC( IsEmpty Operator which checks whether a tensor is empty. @@ -62,5 +58,12 @@ It will just return product(tensor.ddims()) > 0; } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, - paddle::operators::IsEmptyOpProtoMaker); +namespace ops = paddle::operators; + +REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + is_empty, ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3e3af22fa8d842b6a1e67418446f1a40949e046b --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +class IsEmptyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get input + auto* input_tensor = context.Input("X"); + // get output + auto* output_tensor = context.Output("Out"); + + output_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(input_tensor->dims()) == 0; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 63eaaedcd5fc3df17902511dc02b25bf43ccd241..60e936298defe7c6ce8a33bdc7de05b52eb950e7 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,6 +18,26 @@ limitations under the License. */ namespace paddle { namespace operators { +using mkldnn::memory; // Note: paddle also has a "memory" namespace +using mkldnn::pooling_forward; +using mkldnn::pooling_backward; + +// Generate keys for storing/retrieving primitives for this operator +// TODO(jczaja): Make hashing function more optimal +static std::string gethash(memory::dims& input_dims, std::string& pooling_type, + std::vector& ksize, std::vector& strides, + std::vector& paddings, std::string suffix) { + auto dims2str = [](memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + + dims2str(paddings) + pooling_type + suffix; +} + template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -34,10 +54,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Out" variable // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Out"); - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -63,37 +79,71 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, - pooling_type, mkldnn_engine); - - // save pool_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_pool_pd, pool_pd); - - std::shared_ptr workspace_memory = - CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); - - // save pool_workspace_memory to be referred in backward path - dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, -
static_cast(const_cast(output_data))); + const std::string key = gethash(src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Output("Out")); + const std::string key_pool_p = key + "@pool_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; - auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, - *workspace_memory); + auto pool_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); + if (pool_p == nullptr) { + // TODO(pzelazko-intel): support more formats + + auto src_md = + platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, + pooling_type, mkldnn_engine); + + // save pool_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_pool_pd, pool_pd); + + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + auto pool_src_memory_p = std::make_shared( + memory::primitive_desc{src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); + dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + + auto pool_dst_memory_p = std::make_shared( + memory::primitive_desc{dst_md, mkldnn_engine}, + static_cast(output_data)); + dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + + pool_p = std::make_shared( + *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), + *workspace_memory); + dev_ctx.SetBlob(key_pool_p, pool_p); + } else { + // Primitives already exist + auto pool_src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(pool_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(pool_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_src_memory_p->set_data_handle( + reinterpret_cast(const_cast(input_data))); + pool_dst_memory_p->set_data_handle(output_data); + } // push primitive to stream and wait until it's executed - std::vector pipeline{pool_prim}; + std::vector pipeline{*(pool_p.get())}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } @@ -120,9 +170,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::primitive_desc workspace_md = pooling_type == "max" ? 
pool_pd->workspace_primitive_desc() - : mkldnn::memory::primitive_desc( - {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw}, - engine); + : mkldnn::memory::primitive_desc({{}, + platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw}, + engine); auto p_workspace_memory = new mkldnn::memory(workspace_md); return std::unique_ptr(p_workspace_memory); @@ -140,13 +191,6 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when referring info from device context - const std::string key = ctx.op().Input("Out"); - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; - std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -171,43 +215,76 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector diff_dst_tz = paddle::framework::vectorize2int(out_grad->dims()); - auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // Retrieve pool_pd/pool_workspace_memory from device context - auto pool_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_pd)); - PADDLE_ENFORCE(pool_pd != nullptr, - "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_workspace_memory)); - PADDLE_ENFORCE(workspace_memory != nullptr, - "Fail to find workspace_memory in device context"); - - auto pool_bwd_desc = mkldnn::pooling_backward::desc( - pooling_type == "max" ? 
mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, - mkldnn::padding_kind::zero); - auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( - pool_bwd_desc, mkldnn_engine, *pool_pd); - - auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, - static_cast(const_cast(in_x_grad_data))); - auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data))); + // Get a unique name from "argument" name of "Out" variable + // This name will be used as a key when referring to info from the device context + const std::string key = gethash(diff_src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Input("Out")); + const std::string key_pool_bwd_p = key + "@pool_bwd_p"; + const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; + const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; - auto bwd_prim = mkldnn::pooling_backward( - pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); + auto pool_bwd_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_bwd_p)); + if (pool_bwd_p == nullptr) { + auto diff_src_md = + platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + auto diff_dst_md = + platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + // Retrieve pool_pd/pool_workspace_memory from device context + auto pool_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_pd)); + PADDLE_ENFORCE(pool_pd != nullptr, + "Fail to find pool_pd in device context"); + + auto workspace_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_workspace_memory)); + PADDLE_ENFORCE(workspace_memory != nullptr, + "Fail to find workspace_memory in device context"); + + auto pool_diff_src_memory_p = std::make_shared(memory( + {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); + dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); + + auto pool_diff_dst_memory_p = std::make_shared( + memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data)))); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + + auto pool_bwd_desc = mkldnn::pooling_backward::desc( + pooling_type == "max" ?
mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); + auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( + pool_bwd_desc, mkldnn_engine, *pool_pd); + + pool_bwd_p = std::make_shared( + pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, + *(pool_diff_src_memory_p)); + dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { + // Primitives already exist + auto pool_diff_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_diff_src_mem_p)); + PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_diff_dst_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); + PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_diff_src_memory_p->set_data_handle( + reinterpret_cast(in_x_grad_data)); + pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + } // push primitive to stream and wait until it's executed - std::vector pipeline{bwd_prim}; + std::vector pipeline{*(pool_bwd_p.get())}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } // Compute() }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 79e3c26fef51b4d27520a8079de1074d72f89617..b29035bafd34fa81dc6b59691142fe74439202b8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) -cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) +cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 23f1d615daab91f0e4b353bc7d9a3ca7f5cec5ae..56ed5912a15437b72b769610912c7493d77e5964 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -71,5 +71,15 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } +template +mkldnn::memory::data_type MKLDNNGetDataType() { + return mkldnn::memory::data_undef; +} + +template <> +inline mkldnn::memory::data_type MKLDNNGetDataType() { + return mkldnn::memory::f32; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 50bc0aba6aa0f056dc0b2d49f6b3b745433e0756..2fb5c6dc6b8ad25fa1ad5fcf7c2acfedd5be4a83 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { } RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) - : start_ns_(PosixInNsec()) { + : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; PushEvent(name_, dev_ctx_); @@ -183,7 +184,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* 
dev_ctx) } RecordEvent::~RecordEvent() { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), @@ -193,14 +194,16 @@ RecordEvent::~RecordEvent() { PopEvent(name_, dev_ctx_); } -RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) { +RecordBlock::RecordBlock(int block_id) + : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; + is_enabled_ = true; SetCurBlock(block_id); name_ = string::Sprintf("block_%d", block_id); } RecordBlock::~RecordBlock() { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 61b98143e41abb9e47d2c717c7876f1bab7f5077..643bb6183d144ec11a4890d9ea1ca970acb08b4c 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -74,6 +74,7 @@ struct RecordEvent { ~RecordEvent(); + bool is_enabled_; uint64_t start_ns_; // The device context is used by Event to get the current cuda stream. const DeviceContext* dev_ctx_; @@ -89,6 +90,7 @@ struct RecordBlock { ~RecordBlock(); private: + bool is_enabled_; std::string name_; uint64_t start_ns_; }; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 6471eb3ab7bf05365c0bb2bf68bb74ef9044c527..bcf6d4dd3087060c016e53722cde80704ef2e834 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -238,6 +238,7 @@ void BindVarDsec(pybind11::module *m) { pybind11::enum_(var_desc, "VarType", "") .value("BOOL", pd::proto::VarType::BOOL) + .value("UINT8", pd::proto::VarType::UINT8) .value("INT16", pd::proto::VarType::INT16) .value("INT32", pd::proto::VarType::INT32) .value("INT64", pd::proto::VarType::INT64) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 7e00bd38487902227c3b4521db20cdbe314059be..92b8b90880bc78dbc281a959a7472c2822f76fc3 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -198,7 +198,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \ + apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e5af42ed5703f2b277cb6e8b5c81b49656107ecf..fbe219a1c9cf85f19ae2ab991ae7e4207858f204 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -405,17 +405,19 @@ EOF function gen_dockerfile() { # Set BASE_IMAGE according to env variables + CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)" + CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' 
-f 1) if [[ ${WITH_GPU} == "ON" ]]; then - BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" + BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04" else - BASE_IMAGE="ubuntu:16.04" + BASE_IMAGE="ubuntu:16.04" fi DOCKERFILE_GPU_ENV="" DOCKERFILE_CUDNN_DSO="" if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}" - DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so" + DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so" fi cat < 0 and max_sizes[0] > 0: + attrs['max_sizes'] = max_sizes + + box = helper.create_tmp_variable(dtype) + var = helper.create_tmp_variable(dtype) + helper.append_op( + type="prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + def multi_box_head(inputs, image, base_size, @@ -660,47 +753,6 @@ def multi_box_head(inputs, clip=True) """ - def _prior_box_(input, - image, - min_sizes, - max_sizes, - aspect_ratios, - variance, - flip=False, - clip=False, - step_w=0.0, - step_h=0.0, - offset=0.5, - name=None): - helper = LayerHelper("prior_box", **locals()) - dtype = helper.input_dtype() - - attrs = { - 'min_sizes': min_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': step_w, - 'step_h': step_h, - 'offset': offset - } - if len(max_sizes) > 0 and max_sizes[0] > 0: - attrs['max_sizes'] = max_sizes - - box = helper.create_tmp_variable(dtype) - var = helper.create_tmp_variable(dtype) - helper.append_op( - type="prior_box", - inputs={"Input": input, - "Image": image}, - outputs={"Boxes": box, - "Variances": var}, - attrs=attrs, ) - box.stop_gradient = True - var.stop_gradient = True - return box, var - def _reshape_with_axis_(input, axis=1): if not (axis > 0 and axis < len(input.shape)): raise ValueError("The axis should be smaller than " @@ -777,11 +829,10 @@ def multi_box_head(inputs, aspect_ratio = aspect_ratios[i] if not _is_list_or_tuple_(aspect_ratio): aspect_ratio = [aspect_ratio] + step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0] - box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio, - variance, flip, clip, step_w[i] - if step_w else 0.0, step_h[i] - if step_w else 0.0, offset) + box, var = prior_box(input, image, min_size, max_size, aspect_ratio, + variance, flip, clip, step, offset) box_results.append(box) var_results.append(var) diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index c2a15bdb3b17b65fe861dd429f548074c13e2f09..da76747f82d1ab51af07c2e942d1ea893e149b7e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach() add_subdirectory(fit_a_line) add_subdirectory(recognize_digits) +add_subdirectory(image_classification) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt @@ 
-0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py new file mode 100644 index 0000000000000000000000000000000000000000..7fed6d914f75b690e34411aa154359c93b6ca989 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py @@ -0,0 +1,82 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +CIFAR dataset. + +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. + +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. + +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. + +""" + +import cPickle +import itertools +import numpy +import paddle.v2.dataset.common +import tarfile + +__all__ = ['train10'] + +URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' + + +def reader_creator(filename, sub_name, batch_size=None): + def read_batch(batch): + data = batch['data'] + labels = batch.get('labels', batch.get('fine_labels', None)) + assert labels is not None + for sample, label in itertools.izip(data, labels): + yield (sample / 255.0).astype(numpy.float32), int(label) + + def reader(): + with tarfile.open(filename, mode='r') as f: + names = (each_item.name for each_item in f + if sub_name in each_item.name) + + batch_count = 0 + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + if isinstance(batch_size, int) and batch_count > batch_size: + break + batch_count += 1 + yield item + + return reader + + +def train10(batch_size=None): + """ + CIFAR-10 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. 
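(The `cifar10_small_test_set` module added above is consumed by the rewritten image-classification tests further down. A sketch of typical use follows; note that `batch_size` in `train10` caps the number of *samples* yielded, which is what keeps CI runs short.)

```python
import paddle
import cifar10_small_test_set  # the helper module added in this patch

# Wrap the capped reader the same way the tests below do; train10(batch_size=10)
# stops after roughly ten samples instead of the full 50000-image train set.
reader = paddle.batch(
    paddle.reader.shuffle(
        cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
    batch_size=128)

for batch in reader():
    image, label = batch[0]    # image: float32 vector of 3072 pixels in [0, 1]
    print(image.shape, label)  # (3072,) and an int label in [0, 9]
    break
```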
+ + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch', + batch_size=batch_size) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py similarity index 77% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 17db38797cf19ae387f69f66daa42fc78cfcb7d5..1160e500dbd6db784eeb81b72968386347fec59a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid import numpy +import cifar10_small_test_set def resnet_cifar10(input, depth=32): @@ -81,46 +82,50 @@ def train_network(): cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) accuracy = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, accuracy + return [avg_cost, accuracy] -def train(use_cuda, save_path): +def train(use_cuda, train_program, save_dirname): BATCH_SIZE = 128 EPOCH_NUM = 1 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), + cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), batch_size=BATCH_SIZE) test_reader = paddle.batch( paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) def event_handler(event): - if isinstance(event, fluid.EndIteration): - if (event.batch_id % 10) == 0: - avg_cost, accuracy = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) - print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format( - event.batch_id + 1, avg_cost, accuracy)) + print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy)) - if accuracy > 0.01: # Low threshold for speeding up CI - trainer.params.save(save_path) - return + if accuracy > 0.01: # Low threshold for speeding up CI + if save_dirname is not None: + trainer.save_params(save_dirname) + return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_network, + train_func=train_program, optimizer=fluid.optimizer.Adam(learning_rate=0.001), - place=place, - event_handler=event_handler) - trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + place=place) + trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=['pixel', 'label']) -def infer(use_cuda, save_path): - params = fluid.Params(save_path) + +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer(inference_network, params, place=place) + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=save_dirname, place=place) # The input's dimension of conv should be 4-D or 5-D. 
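(All of the test rewrites in this patch converge on the same `fluid.Trainer` calling convention: a `train_func` that builds the program and returns the loss, an explicit `feed_order`, and `EndStepEvent` in the event handler. Here is a self-contained toy sketch of that convention; the regression program and reader are invented for illustration, and only the Trainer calls mirror the patch.)

```python
import numpy as np
import paddle
import paddle.fluid as fluid


def train_program():
    # Toy linear regression, standing in for train_network above.
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    return fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))


def reader():
    for _ in range(64):
        x = np.random.random((1, )).astype('float32')
        yield x, 2.0 * x + 1.0


def event_handler(event):
    if isinstance(event, fluid.EndStepEvent):  # replaces the old EndIteration
        pass  # test/save/stop decisions go here, as in the tests above


trainer = fluid.Trainer(
    train_func=train_program,
    optimizer=fluid.optimizer.SGD(learning_rate=0.01),
    place=fluid.CPUPlace())
trainer.train(
    reader=paddle.batch(reader, batch_size=8),
    num_epochs=1,
    event_handler=event_handler,
    feed_order=['x', 'y'])
```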
# Use normilized image pixels as input data, which should be in the range @@ -135,8 +140,14 @@ def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_resnet.inference.model" - train(use_cuda, save_path) - infer(use_cuda, save_path) + + train( + use_cuda=use_cuda, train_program=train_network, save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=inference_network, + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py similarity index 72% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index e83afeed2f72635a40aa2ac21dc0c8611c309de4..1e3e955ba0299f2cc0fcc02d79ae6fd8ff4c1171 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid import numpy +import cifar10_small_test_set def vgg16_bn_drop(input): @@ -60,46 +61,48 @@ def train_network(): cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) accuracy = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, accuracy + return [avg_cost, accuracy] -def train(use_cuda, save_path): +def train(use_cuda, train_program, save_dirname): BATCH_SIZE = 128 - EPOCH_NUM = 1 - train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), + cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), batch_size=BATCH_SIZE) test_reader = paddle.batch( paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) def event_handler(event): - if isinstance(event, fluid.EndIteration): - if (event.batch_id % 10) == 0: - avg_cost, accuracy = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) - print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format( - event.batch_id + 1, avg_cost, accuracy)) + print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy)) - if accuracy > 0.01: # Low threshold for speeding up CI - trainer.params.save(save_path) - return + if accuracy > 0.01: # Low threshold for speeding up CI + if save_dirname is not None: + trainer.save_params(save_dirname) + return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_network, - optimizer=fluid.optimizer.Adam(learning_rate=0.001), + train_func=train_program, place=place, - event_handler=event_handler) - trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + optimizer=fluid.optimizer.Adam(learning_rate=0.001)) + + trainer.train( + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['pixel', 'label']) -def infer(use_cuda, save_path): - params = fluid.Params(save_path) +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer(inference_network, params, place=place) + inferencer = 
fluid.Inferencer( + infer_func=inference_program, param_path=save_dirname, place=place) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range @@ -114,8 +117,14 @@ def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_vgg.inference.model" - train(use_cuda, save_path) - infer(use_cuda, save_path) + + train( + use_cuda=use_cuda, train_program=train_network, save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=inference_network, + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py similarity index 77% rename from python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py rename to python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index 6d7495e4dd1cd4ae1486fb9a6dc5264bb26a5940..bf86cd9acf8da940fcc2fb5b594e33f9b6965acb 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -90,7 +90,7 @@ def train_program(is_sparse): return avg_cost -def train(use_cuda, train_program, save_path): +def train(use_cuda, train_program, save_dirname): train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) test_reader = paddle.batch( @@ -99,27 +99,36 @@ def train(use_cuda, train_program, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): - outs = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + outs = trainer.test( + reader=test_reader, + feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) avg_cost = outs[0] print("loss= ", avg_cost) - if avg_cost < 5.0: - trainer.save_params(save_path) - return + if avg_cost < 10.0: + trainer.save_params(save_dirname) + trainer.stop() + if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( - train_program, fluid.optimizer.SGD(learning_rate=0.001), place=place) + train_func=train_program, + optimizer=fluid.optimizer.SGD(learning_rate=0.001), + place=place) + trainer.train( - reader=train_reader, num_epochs=1, event_handler=event_handler) + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) -def infer(use_cuda, inference_program, save_path): +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=save_path, place=place) + infer_func=inference_program, param_path=save_dirname, place=place) lod = [0, 1] first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) @@ -127,12 +136,14 @@ def infer(use_cuda, inference_program, save_path): third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - result = inferencer.infer({ - 'firstw': first_word, - 'secondw': second_word, - 'thirdw': third_word, - 'forthw': fourth_word - }) + result = inferencer.infer( + { + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'forthw': fourth_word 
+ }, + return_numpy=False) print(np.array(result[0])) @@ -140,9 +151,17 @@ def main(use_cuda, is_sparse): if use_cuda and not fluid.core.is_compiled_with_cuda(): return - save_path = "word2vec.params" - train(use_cuda, partial(train_program, is_sparse), save_path) - infer(use_cuda, partial(inference_program, is_sparse), save_path) + save_path = "word2vec.inference.model" + + train( + use_cuda=use_cuda, + train_program=partial(train_program, is_sparse), + save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=partial(inference_program, is_sparse), + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 921260ef3f4b1f9e4c65b3ffb440dc34cb0a9376..8569d838bdd414eb84c6c87674990a25a2fdcdf9 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -109,6 +109,24 @@ class TestDetection(unittest.TestCase): print(str(program)) +class TestPriorBox(unittest.TestCase): + def test_prior_box(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py index 4d11cf226be2ba4ffbe015198fed3191f1e02f72..11121d9b65351eab639b7618fac0e54714cf4680 100644 --- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py @@ -14,42 +14,24 @@ import unittest import numpy as np -from paddle.fluid.op import Operator -import paddle.fluid.core as core +from op_test import OpTest -def create_tensor(scope, name, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(np_data.shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class TestIsEmptyOp(unittest.TestCase): +class TestEmpty(OpTest): def setUp(self): - self.scope = core.Scope() - # create input variables - np_data0 = np.array([0, 1, 2]) - create_tensor(self.scope, "X0", np_data0) - - np_data1 = np.array([1]) - t = create_tensor(self.scope, "X1", np_data1) - t.set_dims([0]) + self.op_type = "is_empty" + self.inputs = {'X': np.array([1, 2, 3])} + self.outputs = {'Out': np.array([False])} - # create output variables - self.scope.var("out") + def test_check_output(self): + self.check_output() - def test_no_empty(self): - self.one_case("X0", False) - def test_empty(self): - self.one_case("X1", True) - - def one_case(self, input, target): - op = Operator(type="is_empty", X=input, Out="out") - op.run(self.scope, core.CPUPlace()) - out = self.scope.var("out").get_tensor() - self.assertEqual(np.array(out)[0], target) +class TestNotEmpty(TestEmpty): + def setUp(self): + self.op_type = "is_empty" + self.inputs = {'X': np.array([])} + self.outputs = {'Out': np.array([True])} if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 49034b47b2d184e4027bcebc29413a163340fdaa..80a8f7c09cfe521f8f94a27e85fc8d86c02b3e97 100644 --- 
a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -24,7 +24,8 @@ dtype_to_size = { core.VarDesc.VarType.INT16: 2, core.VarDesc.VarType.INT32: 4, core.VarDesc.VarType.INT64: 8, - core.VarDesc.VarType.BOOL: 1 + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, } SUB_BLOCK_OPS = [
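(Closing note on the last hunk: the new `UINT8: 1` entry lets the memory-optimization transpiler cost `uint8` variables at one byte per element. A plain-Python sketch of that bookkeeping; `tensor_bytes` is an illustrative helper, not a transpiler API, and the float entries are assumed from the rest of the table.)

```python
# Byte width per element, mirroring the transpiler's dtype_to_size table
# (uint8 is the entry added in this patch; float sizes assumed).
dtype_to_size = {
    'float16': 2, 'float32': 4, 'float64': 8,
    'int16': 2, 'int32': 4, 'int64': 8,
    'bool': 1, 'uint8': 1,
}


def tensor_bytes(shape, dtype):
    """Bytes needed by one dense tensor of the given shape and dtype."""
    n = 1
    for d in shape:
        n *= d
    return n * dtype_to_size[dtype]


print(tensor_bytes([32, 3, 224, 224], 'uint8'))    # 4816896
print(tensor_bytes([32, 3, 224, 224], 'float32'))  # 19267584, i.e. 4x
```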