diff --git a/Dockerfile b/Dockerfile index 42a103240e882b2732f14619308cc00f010d20af..b92ac228a8d50da93f8c0bfe2f6af31fc784f2c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. + RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ @@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort RUN pip3.7 --no-cache-dir install pylint pytest astroid isort RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker -RUN pip3 --no-cache-dir install coverage -RUN pip3.6 --no-cache-dir install coverage -RUN pip3.7 --no-cache-dir install coverage +RUN pip3 --no-cache-dir install coverage +RUN pip3.6 --no-cache-dir install coverage +RUN pip3.7 --no-cache-dir install coverage RUN pip --no-cache-dir install coverage COPY ./python/requirements.txt /root/ @@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
\ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5a889dbc3143833ff48a972d17efc0aaf63f1810..3e0a3bf8f5ba9e1570295dc4b85297ddea7c4a14 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} - SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) + SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + endif() set(dst_dir "${DST}/third_party/install/xxhash") copy(${TARGET} diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 0bd6a79b55392e8bfb8f33b0a29b4cf1df0d44dc..5574a55e18c6d9806cb878dc69ec597f81da97d8 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item, TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { - dst_item->ShareDataWith(src_item); + TensorCopy(src_item, platform::CPUPlace(), dst_item); } } else { dst_item->clear(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 4b984210ed18d9f51d9485616d1c28871d936237..551d1342edeb335d1cad4782f85ae9f94f8739bd 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -113,7 +113,9 @@ message DistributedStrategy { optional bool fuse_all_reduce_ops = 18 [ default = true ]; optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ]; optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ]; - // optional bool enable_backward_optimizer_op_deps = 19 [ default = true ]; + optional bool cudnn_exhaustive_search = 21 [ default = true ]; + optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; + optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 0ffa38a037ea431c8c860da8db14c799d1524c0d..2a85c60305bd36e78c071f5703885c23e33b403e 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -29,14 +29,20 @@ namespace framework { namespace compatible { struct OpUpdateRecord { - enum class Type { kInvalid = 0, kModifyAttr, kNewAttr }; + enum class Type { + kInvalid = 0, + kModifyAttr, + kNewAttr, + kNewInput, + kNewOutput + }; Type type_; std::string remark_; }; struct ModifyAttr : OpUpdateRecord { ModifyAttr(const std::string& name, const std::string& remark, - boost::any default_value) + const boost::any& default_value) : OpUpdateRecord({Type::kModifyAttr, remark}), name_(name), default_value_(default_value) { @@ -47,9 +53,10 @@ struct ModifyAttr : OpUpdateRecord { std::string name_; boost::any default_value_; }; + struct NewAttr : OpUpdateRecord { NewAttr(const std::string& name, const std::string& remark, - boost::any default_value) + const boost::any& default_value) : OpUpdateRecord({Type::kNewAttr, remark}), name_(name), default_value_(default_value) {} @@ -59,6 +66,22 @@ struct NewAttr : 
OpUpdateRecord { boost::any default_value_; }; +struct NewInput : OpUpdateRecord { + NewInput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {} + + private: + std::string name_; +}; + +struct NewOutput : OpUpdateRecord { + NewOutput(const std::string& name, const std::string& remark) + : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {} + + private: + std::string name_; +}; + class OpVersionDesc { public: OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark, @@ -75,6 +98,18 @@ class OpVersionDesc { return *this; } + OpVersionDesc& NewInput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewInput(name, remark))); + return *this; + } + + OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) { + infos_.push_back(std::shared_ptr( + new compatible::NewOutput(name, remark))); + return *this; + } + private: std::vector> infos_; }; diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 77891dafc81b3a96af28cb480f1620543caab0b8..052bf3a4b882be749e70704f18f09a7b24551ed7 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -42,7 +42,14 @@ TEST(test_operator_version, test_operator_version) { "height", "In order to represent a two-dimensional rectangle, the " "parameter height is added.", - 0)); + 0)) + .AddCheckpoint( + R"ROC( + Add a input [X2] and a output [Y2] + )ROC", + framework::compatible::OpVersionDesc() + .NewInput("X2", "The second input.") + .NewOutput("Y2", "The second output.")); } } // namespace compatible } // namespace framework diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 1f2734eece578f7ec266a6f31cd46b373f010fc1..98554ed04976670c1a846cbeab69815417c0a998 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32) SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_fluid_shared ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) -elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows - inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_api - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) +elseif(WIN32) + inference_analysis_test(test_analyzer + SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 30e8386f4c86e308372b5dd6328c7d3785a073b1..fb0ad31a3e612201de32813a65970c73b73b611b 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -54,8 +54,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") - elseif(NOT WIN32) - # TODO: Fix this unittest failed on Windows + elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES 
DEPENDS test_image_classification) @@ -67,8 +66,7 @@ endif() if (NOT APPLE AND NOT WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) -elseif (NOT WIN32) - # TODO: Fix this unittest failed on Windows +elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9a3a73f6c946d76f0da1b18feb8e7d61c0bf59b6..1b79c77c69e162a6f96a1762a4949386a7dadde4 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML) set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -191,6 +192,7 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) +set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150) # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 72816f6d0317600ad6bf8ffc4ad31bd1a23d7c30..b9f979f96d4b106642795151fb8e34b025b2caef 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && - ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + URL ${URL}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" - BUILD_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} + ${CMAKE_COMMAND} -E tar xzf ${FILENAME} UPDATE_COMMAND "" INSTALL_COMMAND "" ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index eca8246533fea4df5a057252ce8a79aef2bfe565..7aaaa0002c5ab31af72c75e69f5a283c09633ba4 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -15,7 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/affine_grid_op.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_info.h" + namespace paddle { namespace operators { @@ -84,14 +86,14 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); - atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); - atomicAdd(theta_grad + theta_offset + 2, out_grad_x); + platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); - atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); - atomicAdd(theta_grad + theta_offset + 5, out_grad_y); + platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 7e1e7b1e6929a6fce3a315b4e4711794bc6649b7..999f990448ca6370dadacbdaee5bf3bcadcaca0e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -31,7 +31,7 @@ static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, int sW, int H, int W, T delta) { if (in_bounds(h, w, H, W)) { - atomicAdd(data + h * sH + w * sW, delta); + platform::CudaAtomicAdd(data + h * sH + w * sW, delta); } } diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 354e5c60a6b9ed80f0f8c44439294bfa2731a423..7749903e5f36f1d93f7e111da4587d6828d445a4 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable, out->set_lod(py_out_tensor->lod()); out->ShareDataWith(*py_out_tensor); } catch (py::cast_error &) { - PADDLE_THROW("The %d-th output must be LoDTensor", i); + PADDLE_THROW(platform::errors::InvalidArgument( + "The %d-th output must be LoDTensor.", i)); } } } diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..57891699fd2ad73a1cccce26438528657afdf340 --- /dev/null +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -0,0 +1,515 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include <algorithm>
+#include <cstdio>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
+
+// set cub base traits in order to handle float16
+namespace cub {
+template <>
+struct NumericTraits<paddle::platform::float16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t,
+                 paddle::platform::float16> {};
+}  // namespace cub
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+struct SegmentOffsetIter {
+  EIGEN_DEVICE_FUNC
+  explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
+    return idx * num_cols_;
+  }
+
+  int num_cols_;
+};
+
+// Iter that maps a flat index into its column index
+struct ColumnIndexIter {
+  explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
+      const Eigen::array<int, 1>& ix) const {
+    return ix[0] % num_cols_;
+  }
+
+  int num_cols_;
+};
+
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+
+template <typename T>
+__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
+  int col_id = threadIdx.x;
+  int row_id = blockIdx.x;
+
+  for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
+    for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
+      indices[j * num_cols + i] = i;
+    }
+  }
+}
+
+template <typename T>
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
+
+  __device__ __forceinline__ void set(T value, int64_t id) {
+    v = value;
+    this->id = id;  // assign the member, not the shadowing parameter
+  }
+
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+
+  __device__ __forceinline__ bool operator>(const T value) const {
+    return (v > value);
+  }
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+
+  T v;
+  int64_t id;
+};
+
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size, const bool& largest) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (largest) {
+      if (topk[k] < p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    } else {
+      if (topk[k] > p) {
+        topk[k + 1] = topk[k];
+      } else {
+        topk[k + 1] = p;
+        return;
+      }
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size,
+                                        const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        AddTo<T>(topk, tmp, beam_size, largest);
+      }
+    }
+    idx += BlockSize;
+  }
+}
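
Reviewer note: the per-candidate insertion that `AddTo` performs can be exercised on the host. A minimal sketch for the `largest == true` case (`HostPair` and `HostAddTo` are hypothetical names, not part of this patch):

```cpp
#include <cstdio>

// Keeps a fixed-size buffer sorted in descending order and inserts a
// candidate by shifting smaller entries one slot down, mirroring the
// device-side insertion in AddTo when `largest` is true.
struct HostPair {
  float v;
  long id;
};

void HostAddTo(HostPair topk[], HostPair p, int beam_size) {
  for (int k = beam_size - 2; k >= 0; k--) {
    if (topk[k].v < p.v) {
      topk[k + 1] = topk[k];  // shift the smaller entry down
    } else {
      topk[k + 1] = p;  // found the insertion point
      return;
    }
  }
  topk[0] = p;  // candidate is the new maximum
}

int main() {
  HostPair topk[3] = {{5.f, 0}, {3.f, 1}, {1.f, 2}};
  HostAddTo(topk, {4.f, 7}, 3);
  for (const auto& e : topk) std::printf("%.1f(id=%ld) ", e.v, e.id);
  // prints: 5.0(id=0) 4.0(id=7) 3.0(id=1)
  return 0;
}
```
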
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size, const bool& largest) {
+  while (idx < dim) {
+    if (largest) {
+      if (topk[beam_size - 1] < src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp < max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    } else {
+      if (topk[beam_size - 1] > src[idx]) {
+        Pair<T> tmp(src[idx], idx);
+        if (tmp > max) {
+          AddTo<T>(topk, tmp, beam_size, largest);
+        }
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
+                                              int beam_size, const T* src,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
+                                              const int tid, bool largest) {
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
+      GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
+        } else {
+          topk[k].set(-static_cast<T>(INFINITY), -1);
+        }
+      }
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
+                              length, largest);
+      }
+    }
+
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
+    *beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int64_t** topIds, int* beam, int* k,
+                                            const int tid, const int warp,
+                                            const bool& largest) {
+  while (true) {
+    __syncthreads();
+    if (tid < BlockSize / 2) {
+      if (largest) {
+        if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      } else {
+        if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
+          maxid[tid] = tid + BlockSize / 2;
+        } else {
+          maxid[tid] = tid;
+        }
+      }
+    }
+    __syncthreads();
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (largest) {
+          if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        } else {
+          if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
+            maxid[tid] = maxid[tid + stride];
+          }
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
+    __syncthreads();
+
+    if (tid == maxid[0]) {
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
+      }
+    }
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
+    if (maxid[0] / 32 == warp) {
+      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
+          MaxLength)
+        break;
+    }
+  }
+}
+
+/**
+ * Each block computes one sample.
+ * In a block:
+ * 1. every thread gets its top MaxLength values;
+ * 2. merge them into sh_topk, block-reduce and get the max value;
+ * 3. go to the second step until one thread's topk values are exhausted;
+ * 4. go to the first step until all k top values have been emitted.
+ */ + +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k, + int grid_dim, int num, bool largest = true) { + __shared__ Pair sh_topk[BlockSize]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + int top_num = k; + __shared__ int maxid[BlockSize / 2]; + T* out = output + i * output_stride; + int64_t* inds = indices + i * k; + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int j = 0; j < MaxLength; j++) { + if (largest) { + topk[j].set(-static_cast(INFINITY), -1); + } else { + topk[j].set(static_cast(INFINITY), -1); + } + } + while (top_num) { + ThreadGetTopK(topk, &beam, k, src + i * lds, + &firststep, &is_empty, &max, dim, + tid, largest); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &out, &inds, + &beam, &top_num, tid, warp, largest); + } + } +} + +template +__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, + size_t rows, size_t cols, size_t k) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + x_grad[i * cols + j] = 0; + } + for (size_t j = 0; j < k; ++j) { + size_t idx = indices[i * k + j]; + x_grad[i * cols + idx] = out_grad[i * k + j]; + } + } +} + +// the grad assign with the axis +template +__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices, + T* grad_in, int pre, int post, + int raw_height, int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + const int& base_index = i * post * k; + const int& base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + const int64_t idx_ij = indices[base_index + j]; + const int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[idx_ij]; + } + } +} +// use the radix sort for the topk +template +bool SortTopk(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, const int64_t num_cols, + const int64_t num_rows, const int k, + framework::Tensor* out_tensor, framework::Tensor* indices_tensor, + bool largest = true) { + auto cu_stream = ctx.stream(); + + Tensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = framework::make_ddim(dims); + input_indices.Resize(dim); + // input_indices.Resize(num_rows*num_cols); + input_indices.mutable_data(ctx.GetPlace()); + size_t temp_storage_bytes = -1; + + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(num_cols); + + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + // actually, int num_rows < max_grid_size + unsigned int grid_size = num_rows < maxGridDimX + ? 
static_cast(num_rows) + : maxGridDimX; + // Init a index array + InitIndex<<>>( + input_indices.data(), num_rows, num_cols); + + // create iter for counting input + cub::CountingInputIterator counting_iter(0); + // segment_offset is used for move to next row + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); + + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + + Tensor temp_values; + Tensor temp_indices; + + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + + if (k == num_cols) { + // Doing a full sort. + sorted_values_ptr = values; + sorted_indices_ptr = indices; + } else { + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); + sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + } + + // Get temp storage buffer size, maybe can allocate a fixed buffer to save + // time. + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) + << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } + Tensor temp_storage; + temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } + auto& dev = *ctx.eigen_device(); + if (k < num_cols) { + // copy sliced data to output. 
+ const Eigen::DSizes slice_indices{0, 0}; + const Eigen::DSizes slice_sizes{num_rows, k}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = EigenMatrix::From(temp_indices); + + std::vector odims = {static_cast(num_rows), static_cast(k)}; + auto dim = framework::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = EigenMatrix::From(temp_values); + + e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); + e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + } + return true; +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 82ecc2887ba240560cf15165f21bc995f4683159..d8b2e92616091a8c822c6fd0bfdfb1148c25534d 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once #include +#include #include "cub/cub.cuh" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" // set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct Pair { - __device__ __forceinline__ Pair() {} - __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} - - __device__ __forceinline__ void set(T value, int64_t id) { - v = value; - id = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v = in.v; - id = in.id; - } - - __device__ __forceinline__ bool operator<(const T value) const { - return (v < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v < in.v) || ((v == in.v) && (id > in.id)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v > in.v) || ((v == in.v) && (id < in.id)); - } - - T v; - int64_t id; -}; - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, - int beam_size) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, int beam_size) { - while (idx < dim) { - if 
(topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* src, - bool* firstStep, bool* is_empty, - Pair* max, int dim, - const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, src, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - (*beam)) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -static_cast(1)) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* val, - int* col, bool* firstStep, - bool* is_empty, Pair* max, - int dim, const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, val, col, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - *beam) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, - Pair topk[], T** topVal, - int64_t** topIds, int* beam, int* k, - const int tid, const int warp) { - while (true) { - __syncthreads(); - if (tid < BlockSize / 2) { - if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { - maxid[tid] = tid + BlockSize / 2; - } else { - maxid[tid] = tid; - } - } - __syncthreads(); - for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { - maxid[tid] = maxid[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = sh_topk[maxid[0]].v; - **topIds = sh_topk[maxid[0]].id; - (*topVal)++; - (*topIds)++; - } - if (tid == maxid[0]) (*beam)++; - if (--(*k) == 0) break; - __syncthreads(); - - if (tid == maxid[0]) { - if (*beam < MaxLength) { - sh_topk[tid] = topk[*beam]; - } - } - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (maxid[0] / 32 == warp) { - if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == - MaxLength) - break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top MaxLength value; - * 2. merge to sh_topk, block reduce and get max value; - * 3. go to the second setp, until one thread's topk value is null; - * 4. go to the first setp, until get the topk value. 
- */ - -template -__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k, - int grid_dim, int num) { - __shared__ Pair sh_topk[BlockSize]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - - const int bid = blockIdx.x; - for (int i = bid; i < num; i += grid_dim) { - int top_num = k; - __shared__ int maxid[BlockSize / 2]; - T* out = output + i * output_stride; - int64_t* inds = indices + i * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; - - for (int j = 0; j < MaxLength; j++) { - topk[j].set(-static_cast(INFINITY), -1); - } - while (top_num) { - ThreadGetTopK( - topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &out, &inds, - &beam, &top_num, tid, warp); - } - } -} - -template -__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, - size_t rows, size_t cols, size_t k) { - for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < cols; ++j) { - x_grad[i * cols + j] = 0; - } - for (size_t j = 0; j < k; ++j) { - size_t idx = indices[i * k + j]; - x_grad[i * cols + idx] = out_grad[i * k + j]; - } - } -} - -inline static int GetDesiredBlockDim(int dim) { - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -// Iter using into a column -struct ColumnIndexIter { - explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( - const Eigen::array& ix) const { - return ix[0] % num_cols_; - } - - int num_cols_; -}; - -__global__ void InitIndex(int64_t* indices, int64_t num_rows, - int64_t num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (int64_t j = row_id; j < num_rows; j += gridDim.x) { - for (int64_t i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -bool SortTopk(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = framework::make_ddim(dims); - input_indices.Resize(dim); - // input_indices.Resize(num_rows*num_cols); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; - // actually, int num_rows < max_grid_size - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - // Init a index array - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - - Tensor temp_values; - Tensor temp_indices; - - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - - if (k == num_cols) { - // Doing a full sort. - sorted_values_ptr = values; - sorted_indices_ptr = indices; - } else { - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - } - - // Get temp storage buffer size, maybe can allocate a fixed buffer to save - // time. - auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " - "temp_storage_bytes, status: " - << cudaGetErrorString(err); - return false; - } - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " - "temp_storage_bytes: " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } - auto& dev = *ctx.eigen_device(); - if (k < num_cols) { - // copy sliced data to output. - const Eigen::DSizes slice_indices{0, 0}; - const Eigen::DSizes slice_sizes{num_rows, k}; - auto e_indices = EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = EigenMatrix::From(temp_indices); - - std::vector odims = {static_cast(num_rows), static_cast(k)}; - auto dim = framework::make_ddim(odims); - auto e_values = EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = EigenMatrix::From(temp_values); - - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); - } - return true; -} - #define FIXED_BLOCK_DIM_BASE(dim, ...) 
\ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel { framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); const int64_t input_width = inputdims[inputdims.size() - 1]; const auto& dev_ctx = ctx.cuda_device_context(); - if ((input_width <= 1024 || k >= 128 || k == input_width)) { if (SortTopk(dev_ctx, input, input_width, input_height, k, output, indices)) { @@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; const auto& dev_ctx = context.cuda_device_context(); - const int kMaxHeight = 2048; int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { @@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - REGISTER_OP_CUDA_KERNEL( top_k, paddle::operators::TopkOpCUDAKernel + +namespace paddle { +namespace operators { + +class TopkV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + const int k = static_cast(ctx->Attrs().Get("k")); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, + "the axis of topk" + "must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis); + + if (axis < 0) axis += dim_size; + + PADDLE_ENFORCE_GE( + k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + "input of topk must have >= 1d shape"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + input_dims[axis], k, + "input of topk op must have >= %d columns in axis of %d", k, axis); + } + + framework::DDim dims = input_dims; + + dims[axis] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(), + layout_, library_); + } +}; + +class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. 
+ +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the tensor).") + .SetDefault(1); + AddAttr("axis", + "the axis to sort and get the k indices, value." + "if not set, will get k value in last axis.") + .SetDefault(-1); + AddAttr("largest", + "control flag whether to return largest or smallest") + .SetDefault(true); + AddAttr("sorted", + "control flag whether to return elements in sorted order") + .SetDefault(true); + } +}; + +class TopkV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class TopkV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("top_k_v2_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, + ops::TopkV2GradOpMaker, + ops::TopkV2GradOpMaker); + +REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); + +REGISTER_OP_CPU_KERNEL(top_k_v2, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel) + +REGISTER_OP_CPU_KERNEL( + top_k_v2_grad, ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5154503292014fa43efae919f20d731060b9db57 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.cu @@ -0,0 +1,272 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/p_norm_op.h"
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/fluid/operators/top_k_v2_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
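
Reviewer note: these macros turn the runtime block-size choice into a compile-time template parameter. A self-contained sketch of the same dispatch pattern (`LaunchTopK` is a hypothetical stand-in for the real kernel launch, not part of this patch):

```cpp
#include <cstdio>

// Each generated `case` binds kBlockDim as a compile-time constant, so the
// templated launch below is instantiated once per supported block size.
template <int BlockDim>
void LaunchTopK(int width) {
  std::printf("launch with BlockDim=%d for width=%d\n", BlockDim, width);
}

#define FIXED_BLOCK_DIM_BASE(dim, ...) \
  case (dim): {                        \
    constexpr auto kBlockDim = (dim);  \
    __VA_ARGS__;                       \
  } break

#define FIXED_BLOCK_DIM(...)                \
  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)

inline int GetDesiredBlockDim(int dim) {
  if (dim > 128) return 256;
  if (dim > 64) return 128;
  if (dim > 32) return 64;
  return 32;
}

int main() {
  int width = 100;  // expect BlockDim=128
  switch (GetDesiredBlockDim(width)) {
    FIXED_BLOCK_DIM(LaunchTopK<kBlockDim>(width));
    default:
      std::printf("unsupported width\n");
  }
  return 0;
}
```
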
+
+template <typename DeviceContext, typename T>
+class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+
+    // get the attributes
+    int k = static_cast<int>(ctx.Attr<int>("k"));
+    int axis = static_cast<int>(ctx.Attr<int>("axis"));
+    const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
+    const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
+
+    // get the input dims
+    const auto& in_dims = input->dims();
+    // calculate the real axis
+    if (axis < 0) axis += in_dims.size();
+
+    auto* k_t = ctx.Input<Tensor>("K");
+    if (k_t) {
+      Tensor k_host;
+      framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
+      k = k_host.data<int>()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[axis] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    const auto& out_dims = output->dims();
+
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    if (axis == in_dims.size() - 1) {
+      // get the topk along the last axis
+      const int64_t& input_height = framework::product(
+          framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
+      const int64_t& input_width = in_dims[in_dims.size() - 1];
+      const auto& dev_ctx = ctx.cuda_device_context();
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
+                        indices, largest)) {
+          // Succeeded, return.
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      // NOTE: pass lds and dim same to input width.
+      // NOTE: old matrix implementation of stride is different to eigen.
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 20,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                output_data, k, indices_data, input_data, input_width,
+                input_width, static_cast<int>(k), gridx, input_height,
+                largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape is invalid in the topk CUDA kernel."));
+      }
+    } else {
+      // if topk is not along the last axis, transpose the tensor and
+      // compute topk on the transposed result
+
+      // first step, prepare the trans args for the transpose
+      std::vector<int> trans;
+      for (int i = 0; i < axis; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(in_dims.size() - 1);
+      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+        trans.emplace_back(i);
+      }
+      trans.emplace_back(axis);
+
+      framework::DDim trans_dims(in_dims);
+      framework::DDim trans_out_dims(output->dims());
+      for (int i = 0; i < trans.size(); i++) {
+        trans_dims[i] = in_dims[trans[i]];
+        trans_out_dims[i] = out_dims[trans[i]];
+      }
+      // second step, transpose the input
+      Tensor trans_input;
+      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
+      int ndims = trans.size();
+      const auto& dev_ctx = ctx.cuda_device_context();
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
+                                                   &trans_input, trans);
+      // third step, calculate the topk
+      // allocate the tmp cuda memory for the tmp result
+      Tensor trans_ind;
+      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
+      Tensor trans_out;
+      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
+
+      const int64_t input_height = framework::product(
+          framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+      const int64_t input_width = trans_dims[trans_dims.size() - 1];
+
+      if (k > input_width) k = input_width;
+
+      if ((input_width <= 1024 || k >= 128 || k == input_width)) {
+        if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
+                        &trans_out, &trans_ind, largest)) {
+          // last step, transpose back the indices and output
+          TransCompute<platform::CUDADeviceContext, int64_t>(
+              ndims, dev_ctx, trans_ind, indices, trans);
+          TransCompute<platform::CUDADeviceContext, T>(
+              ndims, dev_ctx, trans_out, output, trans);
+          return;
+        } else {
+          LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
+                       "default topk kernel.";
+        }
+      }
+
+      const int kMaxHeight = 2048;
+      int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+      switch (GetDesiredBlockDim(input_width)) {
+        FIXED_BLOCK_DIM(
+            KeMatrixTopK<T, 20,
+                         kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+                trans_out.data<T>(), k, trans_ind.data<int64_t>(),
+                trans_input.data<T>(), input_width, input_width,
+                static_cast<int>(k), gridx, input_height, largest));
+        default:
+          PADDLE_THROW(platform::errors::Fatal(
+              "the input data shape is invalid in the topk CUDA kernel."));
+      }
+
+      // last step, transpose back the indices and output
+      TransCompute<platform::CUDADeviceContext, int64_t>(
+          ndims, dev_ctx, trans_ind, indices, trans);
+      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
+                                                   output, trans);
+    }
+  }
+};
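
Reviewer note: the permutation built in the non-last-axis branch swaps the topk axis with the last dimension and keeps the remaining dimensions in order. A standalone sketch (`TopKTransposePerm` is a hypothetical helper, not part of this patch):

```cpp
#include <cstdio>
#include <vector>

// Same loop structure as in TopkV2OpCUDAKernel::Compute: move the old last
// dimension into axis' slot and the topk axis to the last position.
std::vector<int> TopKTransposePerm(int rank, int axis) {
  std::vector<int> trans;
  for (int i = 0; i < axis; i++) trans.emplace_back(i);
  trans.emplace_back(rank - 1);  // old last dim takes axis' slot
  for (int i = axis + 1; i < rank - 1; i++) trans.emplace_back(i);
  trans.emplace_back(axis);  // topk axis becomes the last dim
  return trans;
}

int main() {
  for (int v : TopKTransposePerm(4, 1)) std::printf("%d ", v);
  // prints: 0 3 2 1  -> dims (d0, d1, d2, d3) become (d0, d3, d2, d1)
  return 0;
}
```
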
+
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
+
+template <typename DeviceContext, typename T>
+class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(context.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
+    auto* x = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* indices = context.Input<Tensor>("Indices");
+    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int axis = context.Attr<int>("axis");
+
+    const auto& in_dims = x->dims();
+    const auto& out_dims = indices->dims();
+
+    // get the real axis and the k
+    if (axis < 0) axis += in_dims.size();
+    const int& k = out_dims[axis];
+    const int& raw_height = in_dims[axis];
+
+    // allocate the cuda memory for the x_grad
+    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const int64_t* indices_data = indices->data<int64_t>();
+
+    int pre, n, post;
+    GetDims(in_dims, axis, &pre, &n, &post);
+
+    // calculate the block and grid num
+    auto& dev_ctx = context.cuda_device_context();
+    auto ComputeBlockSize = [](int col) {
+      if (col > 512)
+        return 1024;
+      else if (col > 256 && col <= 512)
+        return 512;
+      else if (col > 128 && col <= 256)
+        return 256;
+      else if (col > 64 && col <= 128)
+        return 128;
+      else
+        return 64;
+    };
+    int block_size = ComputeBlockSize(post * k);
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+    int grid_size = std::min(max_blocks, pre);
+
+    // launch the cuda kernel to assign the grad
+    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+        out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          float>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          double>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>,
+    paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                          paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
+                       paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, int64_t>,
+    paddle::operators::TopkV2OpGradCUDAKernel<
+        paddle::platform::CUDADeviceContext, paddle::platform::float16>);
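
Reviewer note: for reference, a CPU sketch of the scatter the grad path intends on the (pre, n, post) view produced by `GetDims`, where n is the size of the topk axis. `AssignGradCPU` is a hypothetical illustration, not part of this patch:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Zero the input gradient, then route each top-k output gradient back to the
// position along the topk axis that `indices` selected.
void AssignGradCPU(const float* grad_out, const int64_t* indices,
                   float* grad_in, int pre, int post, int n, int k) {
  std::fill(grad_in, grad_in + pre * n * post, 0.f);
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < k * post; ++j) {
      const int64_t idx = indices[i * post * k + j];
      grad_in[i * post * n + idx * post + j % post] =
          grad_out[i * post * k + j];
    }
  }
}

int main() {
  // One row of width 4, k = 2: top-2 values sat at positions 2 and 0.
  float grad_out[2] = {0.9f, 0.5f};
  int64_t indices[2] = {2, 0};
  float grad_in[4];
  AssignGradCPU(grad_out, indices, grad_in, /*pre=*/1, /*post=*/1, /*n=*/4,
                /*k=*/2);
  for (float v : grad_in) std::printf("%.1f ", v);  // 0.5 0.0 0.9 0.0
  return 0;
}
```
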
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a77285d123644e6ea2b9077f3338b92add42f7f0 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + We need topk v2 for backward compatibility: it redefines NaN as the maximum + value during comparison. Without topk v2, the inference results of models + trained with older versions of PaddlePaddle would be affected. +*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +template +static void FullTopK(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, Type* t_indices, + const int& k, const bool& largest, const bool& sorted) { + // when k is small, use a partial sort + bool partial_sort_flag = (k * 64) < input_width; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Eigen::DSizes flat2dims(input_height, input_width); + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), col_vec.begin() + k, col_vec.end(), + [&largest](const std::pair& l, const std::pair& r) { + if (largest) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + } else { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + } + }); + } else { + // use nth_element to get the k largest or k smallest elements + if (largest) { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + // nth_element leaves the leading elements unordered, so sort them + if (sorted) { + std::sort(col_vec.begin(), col_vec.begin() + k - 1, + [&largest](const std::pair& l, + const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + } + } else { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + // nth_element leaves the leading elements unordered, so sort them + if (sorted) { + std::sort( + col_vec.begin(), col_vec.begin() + k - 1, + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + } + } + for (Type j = 0; j < k; ++j) { + t_out[i * k + j] = col_vec[j].first; + t_indices[i * k + j] = col_vec[j].second; + } + } +}
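A minimal NumPy sketch (illustration only, not the operator's code path) of the ordering the comparators above implement for largest=true, where NaN is treated as the maximum value:

import numpy as np

def topk_nan_as_max(row, k):
    # treat NaN as +inf so it wins under largest=True, like the comparator
    key = np.where(np.isnan(row), np.inf, row)
    idx = np.argsort(-key, kind='stable')[:k]
    return row[idx], idx

vals, idx = topk_nan_as_max(np.array([3.0, np.nan, 7.0]), 2)
# vals -> [nan, 7.0], idx -> [1, 2]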
+ +template +static void FullTopKAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data, + const int& k) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(j)] = e_input(j); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(i, j)] = e_input(i, j); + } + } + } +} + +template +class TopkV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Get the top k elements of each row of input tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = context.Output("Indices"); + const auto& in_dims = input->dims(); + int k = static_cast(context.Attr("k")); + const auto& sorted = static_cast(context.Attr("sorted")); + const auto& largest = static_cast(context.Attr("largest")); + + // axis < 0, calculate the real axis + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + // if the K tensor is not null, use the K tensor as k + auto* k_t = context.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + // according to axis, set the K value in the output dim + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + const auto& out_dims = output->dims(); + if (axis + 1 == in_dims.size()) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + FullTopK(input_height, input_width, in_dims.size(), input, + output_data, indices_data, k, largest, sorted); + } else { + // if the topk axis is not the last dim, transpose and then do topk + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + // get the trans input_dims, out_dims + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + for (size_t i = 0; i < trans.size(); i++) { + trans_out_dims[i] = out_dims[trans[i]]; + } + + Tensor trans_inp; + trans_inp.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // transpose the input value + TransCompute(ndims, dev_context, *input, + &trans_inp, trans); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + // Allocate temp tensors to save the topk
values and indices + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); + Tensor tmp_indices; + auto* t_ind = + tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); + + // get the TopK value + FullTopK(input_height, input_width, in_dims.size(), + &trans_inp, t_out, t_ind, k, largest, sorted); + // transpose back + TransCompute( + ndims, dev_context, tmp_indices, indices, trans); + TransCompute(ndims, dev_context, tmp_out, + output, trans); + } + } +}; + +template +class TopkV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + const size_t& k = out_dims[axis]; + + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis + 1 == in_dims.size()) { + // allocate the memory for the input_grad + + // assign the out_grad to input_grad directly + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements have no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, + indices, x_grad_data, k); + } else { + // cannot assign the grad to input_grad directly, must transpose first + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + framework::DDim trans_dims(out_dims); + framework::DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + // transpose the out_grad, indices + Tensor trans_dO; + trans_dO.mutable_data(trans_dims, context.GetPlace()); + Tensor trans_ind; + trans_ind.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // Do transpose + TransCompute(ndims, dev_context, *out_grad, + &trans_dO, trans); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans); + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + + // Assign the out_grad to the transposed input_grad + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + FullTopKAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out, k); + + // Transpose back + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans); + } + } +}; + +} // namespace operators +} // namespace paddle
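As a rough NumPy model of what AssignGradWithAxis and FullTopKAssign compute (illustrative only; np.put_along_axis is this sketch's stand-in, not the kernel's API):

import numpy as np

def topk_grad(x_shape, out_grad, indices, axis):
    # positions not selected by top-k receive zero gradient
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    np.put_along_axis(x_grad, indices, out_grad, axis=axis)
    return x_grad

g = topk_grad((2, 4), np.ones((2, 2), dtype='float32'),
              np.array([[3, 2], [1, 3]]), axis=1)
# g -> [[0., 0., 1., 1.], [0., 1., 0., 1.]]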
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index c141033b2b3e6b9fdeac88610dd1362ba8f98428..1aea96a15eb090ccd1a508641e4c6c0a8dcf7fb9 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -24,17 +24,63 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - auto in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_dims.size(), 1, - platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " "But now the dims of Input(X) is %d.", - in_dims.size())); + if (!ctx->Attrs().Get("is_sorted")) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + PADDLE_ENFORCE_EQ(in_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input(X) should be a 1-D Tensor, " + "but now the dims of Input(X) is %d.", + in_dims.size())); + + ctx->SetOutputDim("Out", {-1}); + ctx->SetOutputDim("Index", in_dims); + return; + } + + bool return_index = ctx->Attrs().Get("return_index"); + bool return_inverse = ctx->Attrs().Get("return_inverse"); + bool return_counts = ctx->Attrs().Get("return_counts"); + auto axis_vec = ctx->Attrs().Get>("axis"); + + if (return_index) { + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + } + if (return_inverse) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + } + if (return_counts) { + OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + } - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); + if (axis_vec.empty()) { + ctx->SetOutputDim("Out", {-1}); + if (return_inverse) { + ctx->SetOutputDim("Index", {framework::product(in_dims)}); + } + } else { + int axis = axis_vec[0]; + if (axis < 0) { + axis += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis, in_dims.size(), + platform::errors::InvalidArgument("The axis(%d) should be less than " "the dimension size(%d) of x.", + axis, in_dims.size())); + auto out_dims = in_dims; + out_dims[axis] = -1; + ctx->SetOutputDim("Out", out_dims); + if (return_inverse) { + ctx->SetOutputDim("Index", {in_dims[axis]}); + } + } + if (return_index) { + ctx->SetOutputDim("Indices", {-1}); + } + if (return_counts) { + ctx->SetOutputDim("Counts", {-1}); + } } protected:
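To make the shape logic above concrete, a usage sketch of the Python wrapper these outputs feed (illustrative only, assuming the paddle.unique API of this release):

import paddle
import numpy as np

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype='int64'))
out, index, inverse, counts = paddle.unique(
    x, return_index=True, return_inverse=True, return_counts=True)
# out     -> [1, 2, 3, 5]        sorted unique values ("Out")
# index   -> [3, 0, 1, 4]        first occurrence of each value ("Indices")
# inverse -> [1, 2, 2, 0, 3, 2]  map from input back to out ("Index")
# counts  -> [1, 1, 3, 1]        multiplicity of each value ("Counts")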
@@ -49,14 +95,47 @@ class UniqueOp : public framework::OperatorWithKernel { class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input tensor. It should be a 1-D tensor."); + AddInput("X", + "Input tensor. It should be a 1-D tensor when Attr(is_sorted)" + " is false or an N-D tensor when Attr(is_sorted) is true."); AddAttr("dtype", "data type for output index"); AddOutput("Out", "A unique subsequence for input tensor."); AddOutput("Index", - "An index tensor pointing to unique subsequence, which has " - "identical shape with input tensor and int64 dtype."); + "Equivalent to inverse in numpy.unique, " + "the indices for where elements in the original input ended up " + "in the returned unique tensor."); + AddOutput( + "Indices", + "The indices of the input tensor that result in the unique tensor.") + .AsDispensable(); + AddOutput("Counts", "The counts for each unique element.").AsDispensable(); + AddAttr("return_index", + "If True, also return the indices of the input" + " tensor that result in the unique Tensor.") + .SetDefault(false); + AddAttr( + "return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique tensor.") + .SetDefault(false); + AddAttr("return_counts", + "If True, also return the counts for each unique element.") + .SetDefault(false); + AddAttr>( + "axis", + "The axis to apply unique. If None, the input will be flattened.") + .SetDefault({}); + AddAttr("is_sorted", + "If True, the unique elements of X are in ascending order." + " Otherwise, the unique elements are not sorted.") + .SetDefault(false); AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence + 1. Return a unique subsequence for a 1-D input tensor, and an index tensor + pointing to this unique subsequence, when Attr(is_sorted) is false. This + means paddle.unique is called. + + 2. Return the unique elements of X in ascending order when Attr(is_sorted) + is true. This means fluid.layers.unique is called. )DOC"); } }; @@ -65,6 +144,8 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel, - ops::UniqueKernel, ops::UniqueKernel, - ops::UniqueKernel); +REGISTER_OP_CPU_KERNEL( + unique, ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel, + ops::UniqueKernel); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index cdfd797cbfdf87a42ce0834eea9467010a058431..dc8b2ac5555126d8cf2bb92d2f506b1bf358e680 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once +#include #include +#include +#include #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { @@ -104,17 +109,243 @@ struct UniqueOpFunctor { } }; +static std::vector Unbind(const framework::Tensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = in.Slice(i, i + 1); + } + return tensors; +} + +template +static bool Equal(const framework::Tensor& a, const framework::Tensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + template +static void UniqueFlattendTensor(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, bool return_index, + bool return_inverse, bool return_counts) { + const T* in_data = in.data(); + std::set unique(in_data, in_data + in.numel()); + out->Resize(framework::make_ddim({static_cast(unique.size())})); + auto out_data = out->mutable_data(context.GetPlace()); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + auto* indices = context.Output("Indices"); + indices->Resize(framework::make_ddim({out->numel()})); + auto indices_data = indices->mutable_data(context.GetPlace()); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + auto* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({in.numel()})); + auto inverse_data = inverse->mutable_data(context.GetPlace()); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + auto* count = context.Output("Counts"); + count->Resize(framework::make_ddim({out->numel()})); + auto count_data = count->mutable_data(context.GetPlace()); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context, + ForwardIt first, ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = 
sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const framework::ExecutionContext& context, + const framework::Tensor& in, framework::Tensor* out, + bool return_index, bool return_inverse, + bool return_counts, int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(framework::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + framework::Tensor in_trans; + framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + in_trans.mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + TransCompute(in.dims().size(), dev_ctx, in, &in_trans, + permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + framework::DDim in_trans_flat_dims = + framework::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const T* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + T lhs = in_trans_data[i + a * col]; + T rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + framework::Tensor input_sorted; + input_sorted.Resize(in_trans_dims); + input_sorted.mutable_data(context.GetPlace()); + T* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, T>( + context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec, + &inverse_vec, &counts_vec, &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + math::ConcatFunctor concat_functor; + framework::Tensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); + out_trans.mutable_data(context.GetPlace()); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(framework::make_ddim(out_trans_dims_vec)); + out->mutable_data(context.GetPlace()); + concat_functor(dev_ctx, input_unbind, 0, &out_trans); + TransCompute(out_trans.dims().size(), dev_ctx, out_trans, + out, permute); + + if (return_inverse) { + auto* inverse = context.Output("Index"); + framework::TensorFromVector(inverse_vec, context.device_context(), inverse); + } + + if (return_counts) { + auto* count = context.Output("Counts"); + 
framework::TensorFromVector(counts_vec, context.device_context(), count); + } + + if (return_index) { + auto* indices = context.Output("Indices"); + framework::TensorFromVector(indices_vec, context.device_context(), indices); + } +} + +template class UniqueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); auto* x = context.Input("X"); auto* out = context.Output("Out"); - auto* index = context.Output("Index"); + if (!context.Attr("is_sorted")) { + auto data_type = static_cast( + context.Attr("dtype")); + auto* index = context.Output("Index"); + + framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + return; + } - framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); + std::vector axis_vec = context.Attr>("axis"); + bool return_index = context.Attr("return_index"); + bool return_inverse = context.Attr("return_inverse"); + bool return_counts = context.Attr("return_counts"); + + if (axis_vec.empty()) { + UniqueFlattendTensor(context, *x, out, return_index, return_inverse, + return_counts); + } else { + int axis = axis_vec[0]; + UniqueDim(context, *x, out, return_index, + return_inverse, return_counts, axis); + } } }; diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index ec458ee7957c6a496ad4bd5579fe0b3c8a72069d..d7126b958650acfdf10a1b75ec5e8ee51643c008 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -62,6 +62,7 @@ std::map> op_outs_map = { {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 0b4e4502bb816c4ca293e54d5f78e8e504df2b48..4377a8c2cef5aab7a200955cd25830d448014817 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -564,9 +565,9 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, if (!is_gpu_tensor && !is_xpu_tensor) { if (!need_deep_copy) { - return py::array(py::buffer_info( - const_cast(tensor_buf_ptr), sizeof_dtype, py_dtype_str, - static_cast(tensor.dims().size()), py_dims, py_strides)); + auto base = py::cast(std::move(tensor)); + return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, + const_cast(tensor_buf_ptr), base); } else { py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 04870f87c40dd305304579a454cb618bf1446e39..1f88eb2109aa23b6b60104451908b0a70c41c898 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,6 +29,8 @@ function(train_test TARGET_NAME) PROPERTIES DEPENDS test_${TARGET_NAME}) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES LABELS "RUN_TYPE=DIST") + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) endforeach() endfunction(train_test) diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 39db5a601d3d46c106a574870f02434bd4bd5cd1..d7a86b653bec44c260a845d454c771ec4440993b 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | | `RUN_TEST` | OFF | Run unit test immediently after the build. | -| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` | ## Docker Images @@ -155,21 +154,6 @@ docker push kubectl ... ``` -### Reading source code with woboq codebrowser - -For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). - -- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: - -```bash -./paddle/scripts/paddle_docker_build.sh html -``` - -- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run: - -``` -docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx -``` ## More Options diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 962ee53a1069b8cd7863a0f1dca616c939eb237d..57defebd7575b41c031957aa9c2f848861a006ac 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -529,13 +529,16 @@ EOF pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2 + ctest --output-on-failure -j $2;mactest_error=$? 
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version # Recovery proxy to avoid failure in later steps export http_proxy=$my_proxy export https_proxy=$my_proxy + if [ "$mactest_error" != 0 ];then + exit 8; + fi fi } @@ -1105,22 +1108,6 @@ EOF esac } -function gen_html() { - cat < 0: + self.fc = Sequential( + Linear(400, 120), + Linear(120, 84), + Linear( + 84, 10, act=classifier_activation)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -335,7 +368,6 @@ class TestModelFunction(unittest.TestCase): model = Model(net, inputs, labels) model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum")) loss, = model.train_batch([data], [label]) - np.testing.assert_allclose(loss.flatten(), ref.flatten()) fluid.disable_dygraph() if dynamic else None @@ -445,33 +477,38 @@ class TestModelFunction(unittest.TestCase): fluid.disable_dygraph() if dynamic else None def test_export_deploy_model(self): - net = LeNet() - inputs = [Input([-1, 1, 28, 28], 'float32', 'image')] - model = Model(net, inputs) - model.prepare() - save_dir = tempfile.mkdtemp() - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - tensor_img = np.array( - np.random.random((1, 1, 28, 28)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) - - model.save_inference_model(save_dir) - - place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( - ) else fluid.CUDAPlace(0) - exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=save_dir, executor=exe)) - - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) + for dynamic in [True, False]: + fluid.enable_dygraph() if dynamic else None + # paddle.disable_static() if dynamic else None + prog_translator = ProgramTranslator() + prog_translator.enable(False) if not dynamic else None + net = LeNetDeclarative() + inputs = [Input([None, 1, 28, 28], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + save_dir = tempfile.mkdtemp() + if not os.path.exists(save_dir): + os.makedirs(save_dir) + tensor_img = np.array( + np.random.random((1, 1, 28, 28)), dtype=np.float32) + ori_results = model.test_batch(tensor_img) + model.save(save_dir, training=False) + fluid.disable_dygraph() if dynamic else None - np.testing.assert_allclose(results, ori_results, rtol=1e-6) - shutil.rmtree(save_dir) + place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + new_scope = fluid.Scope() + with fluid.scope_guard(new_scope): + exe = fluid.Executor(place) + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=save_dir, executor=exe)) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + np.testing.assert_allclose( + results, ori_results, rtol=1e-5, atol=1e-7) + shutil.rmtree(save_dir) class TestRaiseError(unittest.TestCase): diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dd5d0d269a6cb6377b11b6c98e86eef4ee0f8b57..07b3f0d284dcd28d4967131ab85bb2ca3cd1d6da 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -128,6 +128,12 @@ from .layer.norm import GroupNorm 
#DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import InstanceNorm1d #DEFINE_ALIAS +from .layer.norm import InstanceNorm2d #DEFINE_ALIAS +from .layer.norm import InstanceNorm3d #DEFINE_ALIAS +from .layer.norm import BatchNorm1d #DEFINE_ALIAS +from .layer.norm import BatchNorm2d #DEFINE_ALIAS +from .layer.norm import BatchNorm3d #DEFINE_ALIAS # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index afc1614732d06dcef4ca0e1e75cd93e28d6a2d3d..97a4d5432bdc24912a851741328516e9269a64c2 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -160,12 +160,12 @@ from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS from .loss import ctc_loss #DEFINE_ALIAS -# from .norm import batch_norm #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS -# from .norm import instance_norm #DEFINE_ALIAS from .norm import l2_normalize #DEFINE_ALIAS -# from .norm import layer_norm #DEFINE_ALIAS +from .norm import batch_norm #DEFINE_ALIAS +from .norm import instance_norm #DEFINE_ALIAS +from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0b007041b4ab336ae355f5d338a0d7dca9b5380e..13e86e5712a1cd5c014517e37d3803ca24cfb6fb 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -18,16 +18,19 @@ import paddle.fluid as fluid from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core +from ...framework import create_parameter from ...fluid.layers import l2_normalize #DEFINE_ALIAS from ...fluid.layers import lrn #DEFINE_ALIAS +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid import core, dygraph_utils __all__ = [ - # 'batch_norm', + 'batch_norm', # 'data_norm', - # 'group_norm', - # 'instance_norm', + 'instance_norm', 'l2_normalize', - # 'layer_norm', + 'layer_norm', 'lrn', 'normalize', # 'spectral_norm' @@ -110,3 +113,286 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): eps = out.block.create_var(dtype=out.dtype) paddle.fill_constant([1], out.dtype, epsilon, out=eps) return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name) + + +def batch_norm(x, + running_mean, + running_var, + weight, + bias, + training=False, + momentum=0.9, + epsilon=1e-05, + data_format="NCHW", + name=None): + """ + Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + nn.functional.batch_norm is used by nn.BatchNorm1d, nn.BatchNorm2d and nn.BatchNorm3d. Please use those APIs for BatchNorm. + + Parameters: + x(Tensor): input value. Its data type should be float32 or float64. + running_mean(Tensor): running mean. + running_var(Tensor): running variance. + weight(Tensor): The weight tensor of batch_norm, cannot be None.
+ bias(Tensor): The bias tensor of batch_norm, cannot be None. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + training(bool, optional): True means train mode, which computes statistics from the batch data and updates the global (running) mean and variance. False means inference mode, which normalizes with the global mean and variance accumulated during training. Default: False. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW". + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape and data type as x, the normalized result. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + weight_data = np.random.random(size=1).astype('float32') + bias_data = np.random.random(size=1).astype('float32') + x = paddle.to_tensor(x) + rm = paddle.to_tensor(running_mean) + rv = paddle.to_tensor(running_variance) + w = paddle.to_tensor(weight_data) + b = paddle.to_tensor(bias_data) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) + print(batch_norm_out) + """ + + assert len(x.shape) >= 2, "input dim must be larger than 1" + + # not training means use_global_stats; see nn._BatchNormBase for details + use_global_stats = not training + # input and output must share the memory + mean_out = running_mean + variance_out = running_var + + if in_dygraph_mode(): + # dygraph mode requires the attrs as a flat tuple + attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", + data_format, "use_mkldnn", False, "fuse_with_relu", False, + "use_global_stats", use_global_stats) + batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( + x, weight, bias, running_mean, running_var, mean_out, variance_out, + *attrs) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + # static mode requires the attrs as a dict + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + + inputs = { + "X": [x], + "Scale": [weight], + "Bias": [bias], + "Mean": [running_mean], + "Variance": [running_var] + } + + helper = LayerHelper('batch_norm', **locals()) + + dtype = x.dtype if x.dtype != 'float16' else 'float32' + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference(dtype) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [running_mean], + "VarianceOut": [running_var], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + return helper.append_activation(batch_norm_out)
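Because mean_out and variance_out alias running_mean and running_var, calling the functional op with training=True updates the running statistics in place. A small sketch, illustrative only and assuming the signature added above:

import paddle
import numpy as np

paddle.disable_static()
x = paddle.to_tensor(np.random.random((4, 3, 8, 8)).astype('float32'))
rm = paddle.to_tensor(np.zeros(3, dtype='float32'))
rv = paddle.to_tensor(np.ones(3, dtype='float32'))
w = paddle.to_tensor(np.ones(3, dtype='float32'))
b = paddle.to_tensor(np.zeros(3, dtype='float32'))

before = rm.numpy().copy()
y = paddle.nn.functional.batch_norm(x, rm, rv, w, b, training=True)
# new_rm == 0.9 * old_rm + 0.1 * batch_mean, updated without reassignment
assert not np.allclose(before, rm.numpy())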
+ + +def layer_norm(x, + normalized_shape, + weight=None, + bias=None, + epsilon=1e-05, + name=None): + """ + See more details in paddle.nn.LayerNorm. + + Parameters: + x(Tensor): Input Tensor. Its data type should be float32 or float64. + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight(Tensor, optional): The weight tensor of layer_norm. Default: None. + bias(Tensor, optional): The bias tensor of layer_norm. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape as x, the normalized result. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) + + print(layer_norm_out.numpy()) + """ + input_shape = list(x.shape) + input_ndim = len(input_shape) + normalized_ndim = len(normalized_shape) + begin_norm_axis = input_ndim - normalized_ndim + if input_ndim < normalized_ndim or input_shape[begin_norm_axis:] != normalized_shape: + str_normalized_shape = str(normalized_shape) + raise ValueError('Given normalized_shape is ' + str_normalized_shape + + ', expected input with shape [*, ' + + str_normalized_shape[1:] + ', but got input shape ' + str(input_shape)) + + if in_dygraph_mode(): + pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon, + 'begin_norm_axis', begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm') + + inputs = dict() + inputs['X'] = [x] + if weight: + inputs['Scale'] = [weight] + if bias: + inputs['Bias'] = [bias] + attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} + + # create output + helper = LayerHelper('layer_norm', **locals()) + mean_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + variance_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.dtype) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return helper.append_activation(layer_norm_out)
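For reference, the computation layer_norm performs when weight and bias are None, written out over the trailing normalized_shape dims as a NumPy sketch (illustration only):

import numpy as np

# x of shape (2, 3, 4) with normalized_shape=(3, 4) gives begin_norm_axis=1
x = np.random.random((2, 3, 4)).astype('float32')
flat = x.reshape(2, -1)
mu = flat.mean(axis=1, keepdims=True)
var = flat.var(axis=1, keepdims=True)
ref = ((flat - mu) / np.sqrt(var + 1e-5)).reshape(x.shape)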
+ + +def instance_norm(x, + running_mean=None, + running_var=None, + weight=None, + bias=None, + use_input_stats=True, + momentum=0.9, + eps=1e-05, + data_format="NCHW", + name=None): + """ + See more details in nn.layer.InstanceNorm2d. + + Parameters: + x(Tensor): Input Tensor. Its data type should be float32 or float64. + running_mean(Tensor): running mean. Default None. + running_var(Tensor): running variance. Default None. + weight(Tensor, optional): The weight tensor of instance_norm. Default: None. + bias(Tensor, optional): The bias tensor of instance_norm. Default: None. + eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + use_input_stats(bool): Default True. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same shape as x, the normalized result. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm_out = paddle.nn.functional.instance_norm(x) + + print(instance_norm_out.numpy()) + + """ + + if in_dygraph_mode(): + out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps, + "momentum", momentum, "data_format", + data_format) + return out + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") + + attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format} + + if weight and bias: + inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} + else: + inputs = {"X": [x]} + + helper = LayerHelper('instance_norm', **locals()) + saved_mean = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + instance_norm_out = helper.create_variable_for_type_inference(x.dtype) + + outputs = { + "Y": [instance_norm_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return instance_norm_out diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index bb0bd5f70f1f9006285a3fb200537d35b4cf6c30..d30547ffdbe357e5524573df631d2fe02a35eb37 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -41,6 +41,7 @@ from ...fluid import core from ...fluid.framework import in_dygraph_mode from ...fluid.param_attr import ParamAttr from ...fluid.initializer import Constant +from paddle.framework import get_default_dtype from .. import functional as F @@ -423,7 +424,7 @@ class PReLU(layers.Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - - input: Tensor with any shape. + - input: Tensor with any shape. Default dtype is float32. - output: Tensor with the same shape as input. Examples: @@ -433,13 +434,14 @@ class PReLU(layers.Layer): import numpy as np paddle.disable_static() + paddle.set_default_dtype("float64") data = np.array([[[[-2.0, 3.0, -4.0, 5.0], [ 3.0, -4.0, 5.0, -6.0], [-7.0, -8.0, 8.0, 9.0]], [[ 1.0, -2.0, -3.0, 4.0], [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]], 'float32') + [ 6.0, 7.0, 8.0, 9.0]]]], 'float64') x = paddle.to_tensor(data) m = paddle.nn.PReLU(1, 0.25) out = m(x) @@ -461,10 +463,10 @@ class PReLU(layers.Layer): self._weight = self.create_parameter( attr=self._weight_attr, - shape=[num_parameters], - dtype='float32', + shape=[self._num_parameters], + dtype=get_default_dtype(), is_bias=False, - default_initializer=Constant(init)) + default_initializer=Constant(self._init)) def forward(self, x): return F.prelu(x, self._weight) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 369d462a8089a30e6b749ef472aad66166cb590d..c7855b23bf6e6861326533e3cc93d7f7c5bd4ca2 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1,4 +1,17 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,28 +27,877 @@ # TODO: define normalization api -import warnings from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS -from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS -from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +#from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS + +#from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import layers + +from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.initializer import Constant from ...fluid.param_attr import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype, check_type -from ...fluid import core +from ...fluid import core, dygraph_utils + +from ..functional import batch_norm, layer_norm, instance_norm + +import numpy as np +import numbers +import warnings __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', + 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' ] + +class _InstanceNormBase(layers.Layer): + """ + This class is the base class for InstanceNorm1d, 2d and 3d. + + See InstanceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. + """ + + def __init__(self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + track_running_stats=False, + data_format="NCHW", + name=None): + super(_InstanceNormBase, self).__init__() + + if weight_attr == False or bias_attr == False: + assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to False at the same time in InstanceNorm" + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if weight_attr != False and bias_attr != False: + self.scale = self.create_parameter( + attr=self._weight_attr, + shape=[num_features], + default_initializer=Constant(1.0), + is_bias=False) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True) + else: + self.scale = None + self.bias = None + + def _check_input_dim(self, input): + raise NotImplementedError("InstanceNorm Base error") + + def forward(self, input): + self._check_input_dim(input) + + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon) + + +class InstanceNorm1d(_InstanceNormBase): + """ + Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCL `[batch, in_channels, length]` + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, may be "NC" or "NCL". Default: "NCL". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: tensor with the same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm2d(_InstanceNormBase): + """ + Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, width). + - output: 4-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm3d(_InstanceNormBase): + """ + Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization. + + DataLayout: NCDHW `[batch, in_channels, D, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + ..
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when track_running_stats is set to True, the global mean + and variance are also used during the train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, and the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + to one. If it is set to False, weight_attr will not be created. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. + If it is set to False, bias_attr will not be created. Default: None. + data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, width). + - output: 5-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats are not effective for now. A later version will fix the problem. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class GroupNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``GroupNorm`` class. + For more details, refer to code examples. + It implements the function of the Group Normalization Layer. + Refer to `Group Normalization `_ . + + Parameters: + num_channels(int): The number of channels of input. + num_groups(int): The number of groups the channels are divided into. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + scale :math:`g`.
If it is set to False, no scale will be added to the output units. + If it is set to None, the scale is initialized to one. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. + name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_channels, height, width). + - output: 4-D tensor with the same shape as input x. + + Returns: + None + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') + x = paddle.to_tensor(x_data) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) + group_norm_out = group_norm(x) + + print(group_norm_out.numpy()) + """ + + def __init__(self, + num_channels, + num_groups, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None): + super(GroupNorm, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._num_channels = num_channels + self._num_groups = num_groups + if data_format != 'NCHW': + raise ValueError("unsupported data format: " + data_format) + + param_shape = [self._num_channels] + + self.weight = self.create_parameter( + attr=self._weight_attr or False, + shape=param_shape, + default_initializer=Constant(1.0)) + + self.bias = self.create_parameter( + attr=self._bias_attr or False, shape=param_shape, is_bias=True) + + def forward(self, input): + inputs = {'X': input} + if self.bias is not None: + inputs['Bias'] = self.bias + if self.weight is not None: + inputs['Scale'] = self.weight + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._num_groups}) + + return self._helper.append_activation(group_norm_out, None) + + +class LayerNorm(layers.Layer): + """ + :alias_main: paddle.nn.LayerNorm + :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm + :old_api: paddle.fluid.dygraph.LayerNorm + + This interface is used to construct a callable object of the ``LayerNorm`` class. + For more details, refer to code examples. + It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. + Refer to `Layer Normalization <https://arxiv.org/abs/1607.06450>`_ + + The formula is as follows: + + .. math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + + \\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + + y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + + - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. + - :math:`H`: the number of hidden units in a layer + - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+ - :math:`g`: the trainable scale parameter. + - :math:`b`: the trainable bias parameter. + + Parameters: + normalized_shape(int|list|tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + gain :math:`g`. If it is False, weight is None. If it is None, a default :code:`ParamAttr` would be added as scale. The + :attr:`weight_attr` is initialized as 1 if it is added. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + bias :math:`b`. If it is False, bias is None. If it is None, a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default: None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 2-D, 3-D, 4-D or 5-D tensor. + - output: same shape as input x. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy()) + """ + + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = [normalized_shape] + + self._normalized_shape = list(normalized_shape) + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + param_shape = [np.prod(self._normalized_shape)] + + if weight_attr is False: + self.weight = None + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + + def forward(self, input): + return layer_norm( + input, + normalized_shape=self._normalized_shape, + weight=self.weight, + bias=self.bias, + epsilon=self._epsilon) + + +class _BatchNormBase(layers.Layer): + """ + BatchNorm base class. + """ + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + track_running_stats=True, + name=None): + super(_BatchNormBase, self).__init__() + self._num_features = num_features + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if get_default_dtype() == 'float16': + set_default_dtype('float32') + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = (self._weight_attr is False) or ( + self._weight_attr and self._weight_attr.learning_rate == 0.) + + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = (self._bias_attr is False) or ( + self._bias_attr and self._bias_attr.learning_rate == 0.)
+ + moving_mean_name = None + moving_variance_name = None + + if name is not None: + moving_mean_name = name + "_mean" + moving_variance_name = name + "_variance" + + self._mean = self.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._data_format = data_format + self._in_place = False + self._momentum = momentum + self._epsilon = epsilon + self._fuse_with_relu = False + self._track_running_stats = track_running_stats + + def _check_input_dim(self, input): + raise NotImplementedError( + "subclasses of _BatchNormBase must implement _check_input_dim") + + def forward(self, input): + + self._check_input_dim(input) + + if not self.training and not self._track_running_stats: + raise ValueError( + 'During inference, track_running_stats is expected to be True.') + + if self.training and not self._track_running_stats: + warnings.warn( + "When training, we now always track global mean and variance.") + + return batch_norm( + input, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format) + + +class BatchNorm1d(_BatchNormBase): + """ + Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance), + usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. 
If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, can be "NC" or "NCL". Default: "NCL". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: tensor with the same shape as input x. + + Returns: + None. + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm1d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm2d(_BatchNormBase): + """ + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance), + usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + ..
math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: "NCHW". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, width). + - output: 4-D tensor with the same shape as input x. + + Returns: + None + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm2d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm3d(_BatchNormBase): + """ + Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance),
usually obtained from a pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : a small value added to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter + - :math:`\\beta` : trainable shift parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: "NCDHW". + track_running_stats(bool, optional): Whether to use global mean and variance. During training, + True will track global mean and variance for use in inference. For inference, track_running_stats must be + True. Default: True. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, depth, height, width). + - output: 5-D tensor with the same shape as input x. + + Returns: + None + + **Note**: + Currently, track_running_stats is effectively always True; this will be fixed in a future version. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm3d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy()) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + class SyncBatchNorm(layers.Layer): """ This interface is used to construct a callable object of the ``SyncBatchNorm`` class. diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py index d01e62abaa6374e7fde892c6ae52c16b4b0f13e2..4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr_scheduler.py @@ -153,7 +153,7 @@ class NoamLR(_LRScheduler): warmup_steps(int): The number of warmup steps. A super parameter.
It is a python int number learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``NoamLR`` instance to schedule learning rate. @@ -168,14 +168,14 @@ class NoamLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -185,14 +185,13 @@ class NoamLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NoamLR(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -204,7 +203,7 @@ class NoamLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -251,7 +250,7 @@ class PiecewiseLR(_LRScheduler): values(list): A list of learning rate values that will be picked during different epoch boundaries. The type of element in the list is python float. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``PiecewiseLR`` instance to schedule learning rate. 
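For reference, the update rules that the NoamLR and PiecewiseLR docstrings above describe can be sketched in plain Python. This is an illustrative reading of the documented behavior, not the library implementation; the function names are ours:

    import math

    def noam_lr(epoch, d_model, warmup_steps, learning_rate=1.0):
        # Noam schedule: base * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        step = max(epoch, 1)  # guard against step 0
        return learning_rate * d_model ** -0.5 * min(
            step ** -0.5, step * warmup_steps ** -1.5)

    def piecewise_lr(epoch, boundaries, values):
        # values has one more entry than boundaries
        for i, b in enumerate(boundaries):
            if epoch < b:
                return values[i]
        return values[-1]

With boundaries=[3, 6, 9] and values=[0.1, 0.2, 0.3, 0.4] as in the example, piecewise_lr returns 0.1 for epochs 0-2, 0.2 for 3-5, 0.3 for 6-8, and 0.4 thereafter.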
@@ -267,14 +266,14 @@ class PiecewiseLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -284,14 +283,13 @@ class PiecewiseLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PiecewiseLR(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -303,7 +301,7 @@ class PiecewiseLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -336,7 +334,7 @@ class NaturalExpLR(_LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. gamma (float, optional): A Ratio to update the learning rate. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``NaturalExpLR`` instance to schedule learning rate. 
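The natural exponential decay documented above reduces to a one-line rule; a minimal sketch (illustrative only, not the library code):

    import math

    def natural_exp_lr(epoch, learning_rate, gamma=0.1):
        # lr decays as learning_rate * e^(-gamma * epoch)
        return learning_rate * math.exp(-gamma * epoch)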
@@ -352,14 +350,14 @@ class NaturalExpLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -369,14 +367,13 @@ class NaturalExpLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.NaturalExpLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -388,7 +385,7 @@ class NaturalExpLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -416,7 +413,7 @@ class InverseTimeLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``InverseTimeLR`` instance to schedule learning rate. 
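Likewise, the inverse-time rule above can be sketched as (illustrative only):

    def inverse_time_lr(epoch, learning_rate, gamma=0.1):
        # lr shrinks as 1 / (1 + gamma * epoch)
        return learning_rate / (1.0 + gamma * epoch)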
@@ -432,14 +429,14 @@ class InverseTimeLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() @@ -449,14 +446,13 @@ class InverseTimeLR(_LRScheduler): main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.InverseTimeLR(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -468,7 +464,7 @@ class InverseTimeLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -513,7 +509,7 @@ class PolynomialLR(_LRScheduler): cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``PolynomialLR`` instance to schedule learning rate. 
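A sketch of the polynomial decay described above, assuming the common defaults power=1.0 and cycle=False (illustrative only; parameter defaults here are assumptions, not confirmed by this diff):

    def polynomial_lr(epoch, learning_rate, decay_steps, end_lr=0.0001, power=1.0):
        # decay toward end_lr over decay_steps, then hold (cycle=False)
        step = min(epoch, decay_steps)
        frac = (1.0 - float(step) / decay_steps) ** power
        return (learning_rate - end_lr) * frac + end_lr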
@@ -529,31 +525,30 @@ class PolynomialLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.PolynomialLR(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -565,7 +560,7 @@ class PolynomialLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -629,7 +624,7 @@ class LinearLrWarmup(_LRScheduler): start_lr (float): Initial learning rate of warm up. end_lr (float): Final learning rate of warm up. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``LinearLrWarmup`` instance to schedule learning rate. 
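The linear warmup documented above interpolates from start_lr to end_lr over warmup_steps and then hands control back to the wrapped learning rate; a sketch (illustrative only):

    def linear_warmup_lr(epoch, warmup_steps, start_lr, end_lr, base_lr):
        if epoch < warmup_steps:
            return start_lr + (end_lr - start_lr) * epoch / float(warmup_steps)
        return base_lr  # the wrapped learning_rate takes over after warmup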
@@ -653,25 +648,24 @@ class LinearLrWarmup(_LRScheduler): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.LinearLrWarmup( + scheduler = paddle.optimizer.lr_scheduler.LinearLrWarmup( learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -683,7 +677,7 @@ class LinearLrWarmup(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -733,10 +727,10 @@ class ExponentialLR(_LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . - It should be less than 1.0. Default: 0.1. + gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``ExponentialLR`` instance to schedule learning rate.
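Exponential decay multiplies the rate by gamma once per epoch; equivalently, in closed form (illustrative sketch only):

    def exponential_lr(epoch, learning_rate, gamma):
        # gamma should be less than 1.0 for the rate to decay
        return learning_rate * gamma ** epoch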
@@ -752,31 +746,30 @@ class ExponentialLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ExponentialLR(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -788,7 +781,7 @@ class ExponentialLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -824,7 +817,7 @@ class MultiStepLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . 
Returns: @@ -841,31 +834,30 @@ class MultiStepLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.MultiStepLR(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -877,7 +869,7 @@ class MultiStepLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -934,7 +926,7 @@ class StepLR(_LRScheduler): gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``StepLR`` instance to schedule learning rate. 
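The two step-based schedules here, MultiStepLR above and StepLR below, differ only in how the number of decays is derived from the epoch; a sketch (illustrative only, not the library code):

    import bisect

    def multi_step_lr(epoch, learning_rate, milestones, gamma=0.1):
        # decay once for every milestone already passed
        return learning_rate * gamma ** bisect.bisect_right(milestones, epoch)

    def step_lr(epoch, learning_rate, step_size, gamma=0.1):
        # decay every step_size epochs
        return learning_rate * gamma ** (epoch // step_size)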
@@ -951,31 +943,30 @@ class StepLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.StepLR(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -987,7 +978,7 @@ class StepLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -1032,7 +1023,7 @@ class LambdaLR(_LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``LambdaLR`` instance to schedule learning rate. 
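LambdaLR simply scales the initial rate by a user-supplied function of the epoch; a sketch (illustrative only):

    def lambda_lr(epoch, learning_rate, lr_lambda):
        # e.g. lr_lambda = lambda x: 0.95 ** x, as in the example above
        return learning_rate * lr_lambda(epoch)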
@@ -1048,31 +1039,30 @@ class LambdaLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.LambdaLR(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1084,7 +1074,7 @@ class LambdaLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ @@ -1130,8 +1120,8 @@ class ReduceLROnPlateau(_LRScheduler): change of ``loss`` is ``threshold`` . Default: ``'rel'`` . cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. - epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than eps, the update is - ignored. Default: 1e-8. + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + the update is ignored. Default: 1e-8. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. 
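A compact sketch of the plateau logic the parameters above configure. This is a simplification that keeps state across calls and ignores threshold_mode and cooldown handling; it is an illustration of the documented behavior, not the library implementation:

    class PlateauSketch:
        def __init__(self, lr, factor=0.1, patience=10, min_lr=0.0, epsilon=1e-8):
            self.lr, self.factor, self.patience = lr, factor, patience
            self.min_lr, self.epsilon = min_lr, epsilon
            self.best, self.num_bad = None, 0

        def step(self, loss):
            if self.best is None or loss < self.best:
                self.best, self.num_bad = loss, 0  # improvement: reset counter
            else:
                self.num_bad += 1
            if self.num_bad > self.patience:
                new_lr = max(self.lr * self.factor, self.min_lr)
                if self.lr - new_lr > self.epsilon:  # skip negligible updates
                    self.lr = new_lr
                self.num_bad = 0
            return self.lr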
@@ -1149,31 +1139,30 @@ class ReduceLROnPlateau(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step(loss) - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.ReduceLROnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1185,7 +1174,7 @@ class ReduceLROnPlateau(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step(out[0]) """ @@ -1351,7 +1340,7 @@ class CosineAnnealingLR(_LRScheduler): T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. - verbose (bool): If ``True``, prints a message to stdout for each update. Default: ``False`` . + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: ``CosineAnnealingLR`` instance to schedule learning rate. 
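The cosine annealing rule referenced above has the SGDR closed form; a sketch (illustrative only):

    import math

    def cosine_annealing_lr(epoch, learning_rate, T_max, eta_min=0.0):
        # anneal from learning_rate down to eta_min over T_max epochs
        return eta_min + (learning_rate - eta_min) * (
            1 + math.cos(math.pi * epoch / T_max)) / 2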
@@ -1367,31 +1356,30 @@ class CosineAnnealingLR(_LRScheduler): paddle.disable_static() x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - scheduler = paddle.optimizer.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameter_list=linear.parameters()) for epoch in range(20): for batch_id in range(2): x = paddle.to_tensor(x) out = linear(x) loss = paddle.reduce_mean(out) - out.backward() + loss.backward() sgd.minimize(loss) linear.clear_gradients() scheduler.step() - # train on statich mode + # train on static mode paddle.enable_static() main_prog = paddle.static.Program() start_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, start_prog): - x = paddle.static.data(name='x', shape=[-1, 4, 5]) - y = paddle.static.data(name='y', shape=[-1, 4, 5]) + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) z = paddle.static.nn.fc(x, 100) loss = paddle.mean(z) - scheduler = paddle.optimizer.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) + scheduler = paddle.optimizer.lr_scheduler.CosineAnnealingLR(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler) sgd.minimize(loss) - lr_var = sgd._global_learning_rate() exe = paddle.static.Executor() exe.run(start_prog) @@ -1403,7 +1391,7 @@ class CosineAnnealingLR(_LRScheduler): 'x': np.random.randn(3, 4, 5).astype('float32'), 'y': np.random.randn(3, 4, 5).astype('float32') }, - fetch_list=lr_var.name) + fetch_list=loss.name) scheduler.step() """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 65469759a38087b2919ada6e73aebaaadf93c905..44ec0a5a4df985d5217011a841065ce504483ab7 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -27,7 +27,6 @@ from ..fluid.layers import expand_as #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS from ..fluid.layers import strided_slice #DEFINE_ALIAS from ..fluid.layers import transpose #DEFINE_ALIAS -from ..fluid.layers import unique #DEFINE_ALIAS from ..fluid.layers import unstack #DEFINE_ALIAS from ..fluid.layers import scatter_nd_add #DEFINE_ALIAS @@ -608,6 +607,126 @@ def squeeze(x, axis=None, name=None): return layers.squeeze(x, axis, name) + +def unique(x, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + name=None): + """ + Returns the unique elements of `x` in ascending order. + + Args: + x(Tensor): The input tensor, its data type should be float32, float64, int32, int64. + return_index(bool, optional): If True, also return the indices of the input tensor that + result in the unique Tensor. Default: False. + return_inverse(bool, optional): If True, also return the indices for where elements in + the original input ended up in the returned unique tensor. Default: False. + return_counts(bool, optional): If True, also return the counts for each unique element. Default: False. + axis(int, optional): The axis to apply unique. If None, the input will be flattened. + Default: None. + name(str, optional): Name for the operation. For more information, please refer to + :ref:`api_guide_Name`. Default: None. + + Returns: + tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ + provided only if `return_index` is True. 
`inverse` is provided only if `return_inverse` \ + is True. `counts` is provided only if `return_counts` is True. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + x_data = np.array([2, 3, 3, 1, 5, 3]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [1 2 3 5] + _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) + np_indices = indices.numpy() # [3 0 1 4] + np_inverse = inverse.numpy() # [1 2 2 0 3 2] + np_counts = counts.numpy() # [1 1 3 1] + + x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) + x = paddle.to_tensor(x_data) + unique = paddle.unique(x) + np_unique = unique.numpy() # [0 1 2 3] + + unique = paddle.unique(x, axis=0) + np_unique = unique.numpy() + # [[2 1 3] + # [3 0 1]] + """ + if axis is None: + axis = [] + else: + axis = [axis] + + if in_dygraph_mode(): + out, inverse, indices, counts = core.ops.unique( + x, 'dtype', + convert_np_dtype_to_dtype_('int32'), 'return_index', return_index, + 'return_inverse', return_inverse, 'return_counts', return_counts, + 'axis', axis, "is_sorted", True) + outs = [out] + if return_index: + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + check_variable_and_dtype(x, "input", + ['float32', 'float64', 'int32', 'int64'], 'unique') + check_type(return_index, 'return_index', bool, 'unique') + check_type(return_inverse, 'return_inverse', bool, 'unique') + check_type(return_counts, 'return_counts', bool, 'unique') + if len(axis) != 0: + check_type(axis[0], 'axis', int, 'unique') + + helper = LayerHelper('unique', **locals()) + attrs = { + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": return_index, + "return_inverse": return_inverse, + "return_counts": return_counts, + "axis": axis, + "is_sorted": True + } + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + inverse = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs = {"Out": out, "Index": inverse} + outs = [out] + if return_index: + indices = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs["Indices"] = indices + outs.append(indices) + if return_inverse: + outs.append(inverse) + if return_counts: + counts = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.INT64, stop_gradient=True) + outputs["Counts"] = counts + outs.append(counts) + + helper.append_op( + type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs) + + if len(outs) == 1: + return outs[0] + + return tuple(outs) + + def unsqueeze(x, axis, name=None): """ :alias_main: paddle.unsqueeze diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py old mode 100644 new mode 100755 index 85441597cf56c61ef33d552ec6a9f7c5019b4ec8..4c7eef5fa65108b35cf9792280eeb6c1e7ddf3fc --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -562,34 +562,52 @@ floor_mod = remainder #DEFINE_ALIAS def multiply(x, y, axis=-1, name=None): """ - :alias_main: paddle.multiply - :alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply + Multiply two tensors element-wise. The equation is: -Examples: + .. math:: + out = x * y - .. code-block:: python + **Note**: + ``paddle.multiply`` supports broadcasting. 
If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . - import paddle - import numpy as np + Args: + x (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, its data type should be float32, float64, int32, int64. + axis (int, optional): The start dimension index for broadcasting ``y`` onto ``x``. Default: -1. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y) - print(res.numpy()) # [[5, 12], [21, 32]] + Returns: + N-D Tensor. A location into which the result is stored. Its dimension equals that of ``x``. - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) - res = paddle.multiply(x, y, axis=1) - print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) + y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y) + print(res.numpy()) # [[5, 12], [21, 32]] + + x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) + y_data = np.array([1, 2], dtype=np.float32) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.multiply(x, y, axis=1) + print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] """ op_type = 'elementwise_mul' act = None + if x.dtype != y.dtype: + raise TypeError( + 'Input tensors must be the same type, but received type of x: %s, type of y: %s ' + % (x.dtype, y.dtype)) + if in_dygraph_mode(): return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 91ad3bfa9cc1babd22cf9419ede33ae26f0dc900..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -21,7 +21,6 @@ from ..fluid import core, layers from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS -from ..fluid.layers import topk #DEFINE_ALIAS __all__ = [ 'argmax', @@ -756,3 +755,100 @@ def masked_select(x, mask, name=None): type='masked_select', inputs={'X': x, 'Mask': mask}, outputs={'Y': out}) return out + + +def topk(x, k, axis=None, largest=True, sorted=True, name=None): + """ + This OP finds the values and indices of the k largest or smallest elements along the given axis. + If the input is a 1-D Tensor, it finds the k largest or smallest values and indices. + If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. + + Args: + x(Tensor): An input N-D Tensor with type float32, float64, int32, int64. + k(int|Tensor): The number of top elements to look for along the axis. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. When axis < 0, it works the same way + as axis + R. Default is None, which means the last axis (-1). + largest(bool, optional): If set to True, the k largest elements are + returned in descending order; otherwise the k smallest elements are + returned in ascending order. Default is True. 
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 91ad3bfa9cc1babd22cf9419ede33ae26f0dc900..eede022e05ba61bc23da517e7af7cd2eb58f5416 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -21,7 +21,6 @@ from ..fluid import core, layers
 from ..fluid.layers import argmin  #DEFINE_ALIAS
 from ..fluid.layers import has_inf  #DEFINE_ALIAS
 from ..fluid.layers import has_nan  #DEFINE_ALIAS
-from ..fluid.layers import topk  #DEFINE_ALIAS

 __all__ = [
     'argmax',
@@ -756,3 +755,100 @@ def masked_select(x, mask, name=None):
         type='masked_select', inputs={'X': x,
                                       'Mask': mask}, outputs={'Y': out})
     return out
+
+
+def topk(x, k, axis=None, largest=True, sorted=True, name=None):
+    """
+    This OP finds the values and indices of the k largest or smallest elements along the given axis.
+    If the input is a 1-D Tensor, it finds the k largest or smallest values and indices.
+    If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
+
+    Args:
+        x(Tensor): An input N-D Tensor with type float32, float64, int32, int64.
+        k(int, Tensor): The number of top elements to look for along the axis.
+        axis(int, optional): Axis to compute indices along. The effective range
+            is [-R, R), where R is x.ndim. When axis < 0, it works the same way
+            as axis + R. Default is -1.
+        largest(bool, optional): If set to True, the k largest elements are
+            returned (descending order); otherwise the k smallest are returned
+            (ascending order). Default is True.
+        sorted(bool, optional): Controls whether to return the elements in sorted order. Default is True. On GPU devices, the values are always returned sorted.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        tuple(Tensor): A tuple of two Tensors (values, indices). The value data type is the same as the input `x`; the indices data type is int64.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+
+            data_1 = np.array([1, 4, 5, 7])
+            tensor_1 = paddle.to_tensor(data_1)
+            value_1, indices_1 = paddle.topk(tensor_1, k=1)
+            print(value_1.numpy())
+            # [7]
+            print(indices_1.numpy())
+            # [3]
+            data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
+            tensor_2 = paddle.to_tensor(data_2)
+            value_2, indices_2 = paddle.topk(tensor_2, k=1)
+            print(value_2.numpy())
+            # [[7]
+            #  [6]]
+            print(indices_2.numpy())
+            # [[3]
+            #  [1]]
+            value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
+            print(value_3.numpy())
+            # [[7]
+            #  [6]]
+            print(indices_3.numpy())
+            # [[3]
+            #  [1]]
+            value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
+            print(value_4.numpy())
+            # [[2 6 5 7]]
+            print(indices_4.numpy())
+            # [[1 1 0 0]]
+
+    """
+    if in_dygraph_mode():
+        k = k.numpy().item(0) if isinstance(k, Variable) else k
+        if axis is None:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'largest', largest,
+                                             'sorted', sorted)
+        else:
+            out, indices = core.ops.top_k_v2(x, 'k',
+                                             int(k), 'axis', axis, 'largest',
+                                             largest, 'sorted', sorted)
+        return out, indices
+
+    helper = LayerHelper("top_k_v2", **locals())
+    inputs = {"X": [x]}
+    attrs = {}
+    if isinstance(k, Variable):
+        inputs['K'] = [k]
+    else:
+        attrs = {'k': k}
+    attrs['largest'] = largest
+    attrs['sorted'] = sorted
+    if axis is not None:
+        attrs['axis'] = axis
+
+    values = helper.create_variable_for_type_inference(dtype=x.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
+
+    helper.append_op(
+        type="top_k_v2",
+        inputs=inputs,
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs=attrs)
+    indices.stop_gradient = True
+    return values, indices
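For intuition, the contract of the operator on a 1-D input can be mimicked with plain NumPy. The sketch below uses a full sort for clarity (a real kernel would use partial selection instead) and reproduces the first docstring example:

.. code-block:: python

    import numpy as np

    def topk_1d(x, k, largest=True):
        # Sort once, then slice off the k extreme entries; the contract is
        # the same as the operator's: values plus their original indices.
        order = np.argsort(x)  # ascending indices
        idx = order[::-1][:k] if largest else order[:k]
        return x[idx], idx

    values, indices = topk_1d(np.array([1, 4, 5, 7]), k=1)
    print(values, indices)  # [7] [3]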
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index 049621b9388997deaeab618c09c579858a60d47e..b10e76a4b4d037bfa0d72e74e660cf696f5ee1d3 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt

-RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32

 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu
index aa547fe74163fdbe08c1266817b7f1903e541ad5..9fe58885fa553671cf5c08bd51295f271f4df668 100644
--- a/tools/dockerfile/Dockerfile.ubuntu
+++ b/tools/dockerfile/Dockerfile.ubuntu
@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \

 RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3 --no-cache-dir install opencv-python && \
+    pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.7 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip3.8 --no-cache-dir install opencv-python && \
+    pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
     pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip --no-cache-dir install opencv-python
+    pip --no-cache-dir install opencv-python==4.2.0.32

 #For docstring checker
 RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 6f201a8579fea29ec6eaabf1faca77da26b11882..9f937cf9343784f10d186dd5bdcbace6f8a4e0e9 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -89,7 +89,7 @@ function do_cpython_build {
     fi
     # NOTE Make libpython shared library visible to python calls below
     LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
-    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2 cd / ls ${MY_DIR} local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 index ffef02dba4614f7bbbe13ebc30b40438a52b4590..e3a3374b943bc955d54afbef9755ed5147fad7d2 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) # ar mishandles 4GB files # https://sourceware.org/bugzilla/show_bug.cgi?id=14625 diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 837f0e486f6112bfc645c55ded8dfd0726d414d6..c27fdcea2401c26b1ef1dd377c42930b6e74fcf0 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX -ENV WOBOQ OFF ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} @@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure] RUN pip --no-cache-dir install certifi urllib3[secure] -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . 
-    make)

 # ar mishandles 4GB files
 # https://sourceware.org/bugzilla/show_bug.cgi?id=14625
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 102b50c43aeabc6ab2c67840edfaf42615cf51f5..033b4b8723aa30465cdb07198f470d7c09a0f326 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -480,14 +480,8 @@ def get_filenames():
                 filename = ''
                 print("\nWARNING:----Exception in get api filename----\n")
                 print("\n" + api + ' module is ' + module + "\n")
-        if filename != '':
-            # rm contrib file
-            if filename.startswith(
-                    '../python/paddle/fluid/contrib'
-            ) or filename == '../python/paddle/verison.py':
-                pass
-            elif filename not in filenames:
-                filenames.append(filename)
+        if filename != '' and filename not in filenames:
+            filenames.append(filename)
         # get all methods
         method = ''
         if inspect.isclass(eval(api)):
@@ -557,14 +551,18 @@ def get_wlist():
     '''
     wlist = []
+    wlist_file = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            wlist = wlist + load_dict[key]
-    return wlist
+            if key == 'wlist_file':
+                wlist_file = wlist_file + load_dict[key]
+            else:
+                wlist = wlist + load_dict[key]
+    return wlist, wlist_file


-wlist = get_wlist()
+wlist, wlist_file = get_wlist()

 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
@@ -590,8 +588,14 @@ else:
     if len(filenames) == 0 and len(whl_error) == 0:
         print("-----API_PR.spec is the same as API_DEV.spec-----")
         exit(0)
-    elif '../python/paddle/fluid/core_avx.py' in filenames:
-        filenames.remove('../python/paddle/fluid/core_avx.py')
+    # Collect the whitelisted files first and rebuild the list; calling
+    # filenames.remove() while iterating over filenames would skip entries.
+    rm_file = [f for f in filenames
+               if any(f.startswith(w_file) for w_file in wlist_file)]
+    filenames = [f for f in filenames if f not in rm_file]
+    if len(rm_file) != 0:
+        print("Removed whitelisted files: %s" % rm_file)
     print("API_PR is diff from API_DEV: %s" % filenames)
     one_part_filenum = int(math.ceil(len(filenames) / cpus))
     if one_part_filenum == 0:
diff --git a/tools/wlist.json b/tools/wlist.json
index 6a0360fbcd9d06885d8dff7044e0cce63fa7d92c..c6114918e5932a9cfd139fd0212698c5ea97d3cc 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,4 +1,10 @@
 {
+    "wlist_file" : [
+        "../python/paddle/fluid/contrib",
+        "../python/paddle/verison.py",
+        "../python/paddle/fluid/core_avx.py",
+        "../python/paddle/distributed"
+    ],
     "wlist_inneed":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",