Commit 6ad85cbd authored by: J jingqinghe
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -149,21 +148,11 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version until jupyter fixes this issue.
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since the -U option would cause sphinx to be updated to the
# newest version (1.7.1 for now), which makes the documentation build fail.
RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
......@@ -184,9 +173,9 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
RUN pip3 --no-cache-dir install coverage
RUN pip3.6 --no-cache-dir install coverage
RUN pip3.7 --no-cache-dir install coverage
RUN pip --no-cache-dir install coverage
COPY ./python/requirements.txt /root/
......@@ -204,12 +193,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq && \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . && \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -110,10 +110,12 @@ function(copy_part_of_thrid_party TARGET DST)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_CRYPTO)
set(dst_dir "${DST}/third_party/install/cryptopp")
copy(${TARGET}
SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
endif()
set(dst_dir "${DST}/third_party/install/xxhash")
copy(${TARGET}
......
......@@ -117,7 +117,7 @@ static void TransData(const framework::LoDTensor &src_item,
TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
dst_item->ShareDataWith(src_item);
TensorCopy(src_item, platform::CPUPlace(), dst_item);
}
} else {
dst_item->clear();
......
......@@ -113,7 +113,9 @@ message DistributedStrategy {
optional bool fuse_all_reduce_ops = 18 [ default = true ];
optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
// optional bool enable_backward_optimizer_op_deps = 19 [ default = true ];
optional bool cudnn_exhaustive_search = 21 [ default = true ];
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
......
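The new cuDNN fields above are ordinary optional protobuf scalars, so the generated C++ API exposes the usual accessors. A minimal sketch of setting them (message type and namespace assumed from the .proto, not taken from this commit):
// Hypothetical usage of the protobuf-generated setters for fields 21-23.
DistributedStrategy strategy;
strategy.set_cudnn_exhaustive_search(true);
strategy.set_conv_workspace_size_limit(4000);
strategy.set_cudnn_batchnorm_spatial_persistent(true);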
......@@ -29,14 +29,20 @@ namespace framework {
namespace compatible {
struct OpUpdateRecord {
enum class Type { kInvalid = 0, kModifyAttr, kNewAttr };
enum class Type {
kInvalid = 0,
kModifyAttr,
kNewAttr,
kNewInput,
kNewOutput
};
Type type_;
std::string remark_;
};
struct ModifyAttr : OpUpdateRecord {
ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value)
const boost::any& default_value)
: OpUpdateRecord({Type::kModifyAttr, remark}),
name_(name),
default_value_(default_value) {
......@@ -47,9 +53,10 @@ struct ModifyAttr : OpUpdateRecord {
std::string name_;
boost::any default_value_;
};
struct NewAttr : OpUpdateRecord {
NewAttr(const std::string& name, const std::string& remark,
boost::any default_value)
const boost::any& default_value)
: OpUpdateRecord({Type::kNewAttr, remark}),
name_(name),
default_value_(default_value) {}
......@@ -59,6 +66,22 @@ struct NewAttr : OpUpdateRecord {
boost::any default_value_;
};
struct NewInput : OpUpdateRecord {
NewInput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewInput, remark}), name_(name) {}
private:
std::string name_;
};
struct NewOutput : OpUpdateRecord {
NewOutput(const std::string& name, const std::string& remark)
: OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {}
private:
std::string name_;
};
class OpVersionDesc {
public:
OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
......@@ -75,6 +98,18 @@ class OpVersionDesc {
return *this;
}
OpVersionDesc& NewInput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewInput(name, remark)));
return *this;
}
OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewOutput(name, remark)));
return *this;
}
private:
std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
......
......@@ -42,7 +42,14 @@ TEST(test_operator_version, test_operator_version) {
"height",
"In order to represent a two-dimensional rectangle, the "
"parameter height is added.",
0));
0))
.AddCheckpoint(
R"ROC(
Add an input [X2] and an output [Y2]
)ROC",
framework::compatible::OpVersionDesc()
.NewInput("X2", "The second input.")
.NewOutput("Y2", "The second output."));
}
} // namespace compatible
} // namespace framework
......
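For reference, the checkpoint above is the pattern an operator would use to record new-input/new-output records. A hedged sketch, assuming the REGISTER_OP_VERSION macro from the op version registry:
// Sketch only: register a version checkpoint documenting a new input and a
// new output for a hypothetical operator `my_op`.
REGISTER_OP_VERSION(my_op)
    .AddCheckpoint(
        R"ROC(Add an input [X2] and an output [Y2].)ROC",
        paddle::framework::compatible::OpVersionDesc()
            .NewInput("X2", "The second input.")
            .NewOutput("Y2", "The second output."));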
......@@ -64,10 +64,9 @@ if (NOT APPLE AND NOT WIN32)
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_fluid_shared
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
elseif(NOT WIN32)
# TODO: Fix this unittest failed on Windows
inference_analysis_test(test_analyzer
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
elseif(WIN32)
inference_analysis_test(test_analyzer
SRCS analyzer_tester.cc
EXTRA_DEPS reset_tensor_array paddle_inference_api
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
endif()
......@@ -54,8 +54,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
elseif(NOT WIN32)
# TODO: Fix this unittest failed on Windows
elseif(WIN32)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
......@@ -67,8 +66,7 @@ endif()
if (NOT APPLE AND NOT WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_fluid_shared
ARGS --dirname=${WORD2VEC_MODEL_DIR})
elseif (NOT WIN32)
# TODO: Fix this unittest failed on Windows
elseif (WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
......@@ -132,6 +132,7 @@ if(NOT APPLE AND WITH_MKLML)
set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
......@@ -191,6 +192,7 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz")
inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150)
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
......
......@@ -32,19 +32,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
URL ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_EXTRACT 1
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
${CMAKE_COMMAND} -E tar xzf ${FILENAME}
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/affine_grid_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
......@@ -84,14 +86,14 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
int theta_offset = n * 6; // 2 * 3;
T out_grad_x = out_grad[index * 2];
atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
atomicAdd(theta_grad + theta_offset + 2, out_grad_x);
platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x);
T out_grad_y = out_grad[index * 2 + 1];
atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
atomicAdd(theta_grad + theta_offset + 5, out_grad_y);
platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y);
}
}
......
......@@ -31,7 +31,7 @@ static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
int sW, int H, int W,
T delta) {
if (in_bounds(h, w, H, W)) {
atomicAdd(data + h * sH + w * sW, delta);
platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
}
}
......
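Switching from the raw atomicAdd to platform::CudaAtomicAdd lets these kernels compile for element types that lack a native atomic add on every architecture. An illustrative sketch of the wrapper pattern (not Paddle's actual implementation):
// Native path: float atomicAdd exists on all supported architectures.
__device__ inline float SketchAtomicAdd(float* addr, float val) {
  return atomicAdd(addr, val);
}
// Fallback path: emulate double atomicAdd with a compare-and-swap loop on
// architectures older than sm_60.
__device__ inline double SketchAtomicAdd(double* addr, double val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
  return atomicAdd(addr, val);
#else
  unsigned long long* a = reinterpret_cast<unsigned long long*>(addr);
  unsigned long long old = *a, assumed;
  do {
    assumed = old;
    old = atomicCAS(a, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);  // retry until no other thread intervened
  return __longlong_as_double(old);
#endif
}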
......@@ -111,7 +111,8 @@ static void CallPythonFunc(py::object *callable,
out->set_lod(py_out_tensor->lod());
out->ShareDataWith(*py_out_tensor);
} catch (py::cast_error &) {
PADDLE_THROW("The %d-th output must be LoDTensor", i);
PADDLE_THROW(platform::errors::InvalidArgument(
"The %d-th output must be LoDTensor.", i));
}
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
#include <cstdio>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
// set cub base traits in order to handle float16
namespace cub {
template <>
struct NumericTraits<paddle::platform::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t,
paddle::platform::float16> {};
} // namespace cub
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
struct SegmentOffsetIter {
EIGEN_DEVICE_FUNC
explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
return idx * num_cols_;
}
int num_cols_;
};
// Iterator mapping a flat element index into its column index
struct ColumnIndexIter {
explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
const Eigen::array<int, 1>& ix) const {
return ix[0] % num_cols_;
}
int num_cols_;
};
inline static int GetDesiredBlockDim(int dim) {
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
template <typename T>
__global__ void InitIndex(T* indices, T num_rows, T num_cols) {
int col_id = threadIdx.x;
int row_id = blockIdx.x;
for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
indices[j * num_cols + i] = i;
}
}
}
template <typename T>
struct Pair {
__device__ __forceinline__ Pair() {}
__device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
__device__ __forceinline__ void set(T value, int64_t id) {
v = value;
this->id = id;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
}
__device__ __forceinline__ bool operator<(const T value) const {
return (v < value);
}
__device__ __forceinline__ bool operator>(const T value) const {
return (v > value);
}
__device__ __forceinline__ bool operator<(const Pair<T>& in) const {
return (v < in.v) || ((v == in.v) && (id > in.id));
}
__device__ __forceinline__ bool operator>(const Pair<T>& in) const {
return (v > in.v) || ((v == in.v) && (id < in.id));
}
T v;
int64_t id;
};
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size, const bool& largest) {
for (int k = beam_size - 2; k >= 0; k--) {
if (largest) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
} else {
if (topk[k] > p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
}
topk[0] = p;
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, int beam_size,
const bool& largest) {
while (idx < dim) {
if (largest) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size, largest);
}
} else {
if (topk[beam_size - 1] > src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size, largest);
}
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, const Pair<T>& max,
int beam_size, const bool& largest) {
while (idx < dim) {
if (largest) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size, largest);
}
}
} else {
if (topk[beam_size - 1] > src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp > max) {
AddTo<T>(topk, tmp, beam_size, largest);
}
}
}
idx += BlockSize;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid, bool largest) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length, largest);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length, largest);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp,
const bool& largest) {
while (true) {
__syncthreads();
if (tid < BlockSize / 2) {
if (largest) {
if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
} else {
if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
}
}
__syncthreads();
for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (largest) {
if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
} else {
if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
**topVal = sh_topk[maxid[0]].v;
**topIds = sh_topk[maxid[0]].id;
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) {
if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
}
}
}
/**
* Each block compute one sample.
* In a block:
* 1. every thread get top MaxLength value;
* 2. merge to sh_topk, block reduce and get max value;
* 3. go to the second step, until one thread's topk value is null;
* 4. go to the first step, until the topk value is obtained.
*/
template <typename T, int MaxLength, int BlockSize>
__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
const T* src, int lds, int dim, int k,
int grid_dim, int num, bool largest = true) {
__shared__ Pair<T> sh_topk[BlockSize];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
const int bid = blockIdx.x;
for (int i = bid; i < num; i += grid_dim) {
int top_num = k;
__shared__ int maxid[BlockSize / 2];
T* out = output + i * output_stride;
int64_t* inds = indices + i * k;
Pair<T> topk[MaxLength];
int beam = MaxLength;
Pair<T> max;
bool is_empty = false;
bool firststep = true;
for (int j = 0; j < MaxLength; j++) {
if (largest) {
topk[j].set(-static_cast<T>(INFINITY), -1);
} else {
topk[j].set(static_cast<T>(INFINITY), -1);
}
}
while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k, src + i * lds,
&firststep, &is_empty, &max, dim,
tid, largest);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
&beam, &top_num, tid, warp, largest);
}
}
}
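// Illustrative launch sketch for KeMatrixTopK (names `dev_ctx`, `grid`,
// `output_data`, `indices_data`, `input_data`, `width`, `num_rows` are
// assumed, mirroring the dispatch in the TopK kernels elsewhere in this
// commit):
//   constexpr int kBlockDim = 256;
//   KeMatrixTopK<float, 5, kBlockDim><<<grid, kBlockDim, 0,
//                                       dev_ctx.stream()>>>(
//       output_data, k, indices_data, input_data, /*lds=*/width,
//       /*dim=*/width, k, grid, num_rows, /*largest=*/true);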
template <typename T, int MaxLength, int BlockSize>
__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
size_t rows, size_t cols, size_t k) {
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
x_grad[i * cols + j] = 0;
}
for (size_t j = 0; j < k; ++j) {
size_t idx = indices[i * k + j];
x_grad[i * cols + idx] = out_grad[i * k + j];
}
}
}
// the grad assign with the axis
template <typename T>
__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices,
T* grad_in, int pre, int post,
int raw_height, int k) {
// raw_height is the length of the topk axis
for (int i = blockIdx.x; i < pre; i += gridDim.x) {
const int& base_index = i * post * k;
const int& base_grad = i * post * raw_height;
for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
grad_in[base_grad + j] = static_cast<T>(0);
}
__syncthreads();
for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
const int64_t idx_ij = indices[base_index + j];
const int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
grad_in[in_ij] = grad_out[base_index + j];
}
}
}
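// Worked example for AssignGradWithAxis (illustration only): with pre = 1,
// post = 2, raw_height = 4 and k = 2, a block first zero-fills the
// 4 * 2 = 8 grad_in slots, then for j = 0..3 reads idx_ij = indices[j] and
// scatters grad_out[j] into grad_in[idx_ij * 2 + (j % 2)].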
// use the radix sort for the topk
template <typename T>
bool SortTopk(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor, framework::Tensor* indices_tensor,
bool largest = true) {
auto cu_stream = ctx.stream();
Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = framework::make_ddim(dims);
input_indices.Resize(dim);
// input_indices.Resize(num_rows*num_cols);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
// in practice, num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
// Init an index array
InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
// create an iterator for counting the input
cub::CountingInputIterator<int64_t> counting_iter(0);
// segment_offset is used to move to the next row
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
Tensor temp_values;
Tensor temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
if (k == num_cols) {
// Doing a full sort.
sorted_values_ptr = values;
sorted_indices_ptr = indices;
} else {
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
}
// Get temp storage buffer size; a fixed buffer could be allocated to save time.
if (largest) {
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
} else {
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
}
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
if (largest) {
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to "
"sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes
<< ", status: " << cudaGetErrorString(err);
return false;
}
} else {
auto err = cub::DeviceSegmentedRadixSort::SortPairs(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR) << "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairs to "
"sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes
<< ", status: " << cudaGetErrorString(err);
return false;
}
}
auto& dev = *ctx.eigen_device();
if (k < num_cols) {
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
}
return true;
}
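// Hypothetical host-side use of SortTopk for a [rows x cols] tensor
// (`dev_ctx`, `input`, `out`, `inds` assumed to be prepared by the caller):
//   if (SortTopk<float>(dev_ctx, &input, /*num_cols=*/cols,
//                       /*num_rows=*/rows, k, &out, &inds,
//                       /*largest=*/true)) {
//     // top-k values are in `out`, matching indices in `inds`
//   } else {
//     // fall back to the KeMatrixTopK kernel when the cub path fails
//   }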
} // namespace operators
} // namespace paddle
......@@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdio>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
// set cub base traits in order to handle float16
namespace cub {
template <>
struct NumericTraits<paddle::platform::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t,
paddle::platform::float16> {};
} // namespace cub
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Pair {
__device__ __forceinline__ Pair() {}
__device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
__device__ __forceinline__ void set(T value, int64_t id) {
v = value;
this->id = id;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
}
__device__ __forceinline__ bool operator<(const T value) const {
return (v < value);
}
__device__ __forceinline__ bool operator<(const Pair<T>& in) const {
return (v < in.v) || ((v == in.v) && (id > in.id));
}
__device__ __forceinline__ bool operator>(const Pair<T>& in) const {
return (v > in.v) || ((v == in.v) && (id < in.id));
}
T v;
int64_t id;
};
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size) {
for (int k = beam_size - 2; k >= 0; k--) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
topk[0] = p;
}
template <typename T, int beam_size>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
for (int k = beam_size - 2; k >= 0; k--) {
if (topk[k] < p) {
topk[k + 1] = topk[k];
} else {
topk[k + 1] = p;
return;
}
}
topk[0] = p;
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
AddTo<T>(topk, tmp, beam_size);
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
int dim, const Pair<T>& max,
int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < src[idx]) {
Pair<T> tmp(src[idx], idx);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size);
}
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
int idx, int dim, int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < val[idx]) {
Pair<T> tmp(val[idx], col[idx]);
AddTo<T>(topk, tmp, beam_size);
}
idx += BlockSize;
}
}
template <typename T, int BlockSize>
__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
int idx, int dim, const Pair<T>& max,
int beam_size) {
while (idx < dim) {
if (topk[beam_size - 1] < val[idx]) {
Pair<T> tmp(val[idx], col[idx]);
if (tmp < max) {
AddTo<T>(topk, tmp, beam_size);
}
}
idx += BlockSize;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src,
bool* firstStep, bool* is_empty,
Pair<T>* max, int dim,
const int tid) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val,
int* col, bool* firstStep,
bool* is_empty, Pair<T>* max,
int dim, const int tid) {
if (*beam > 0) {
int length = (*beam) < beam_size ? *beam : beam_size;
if (*firstStep) {
*firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else {
for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(-static_cast<T>(INFINITY), -1);
}
}
if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, *max,
length);
}
}
*max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal,
int64_t** topIds, int* beam, int* k,
const int tid, const int warp) {
while (true) {
__syncthreads();
if (tid < BlockSize / 2) {
if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
maxid[tid] = tid + BlockSize / 2;
} else {
maxid[tid] = tid;
}
}
__syncthreads();
for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
maxid[tid] = maxid[tid + stride];
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
**topVal = sh_topk[maxid[0]].v;
**topIds = sh_topk[maxid[0]].id;
(*topVal)++;
(*topIds)++;
}
if (tid == maxid[0]) (*beam)++;
if (--(*k) == 0) break;
__syncthreads();
if (tid == maxid[0]) {
if (*beam < MaxLength) {
sh_topk[tid] = topk[*beam];
}
}
// NOTE(zcd): temporary solution
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
if (maxid[0] / 32 == warp) {
if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
MaxLength)
break;
}
}
}
/**
* Each block compute one sample.
* In a block:
* 1. every thread get top MaxLength value;
* 2. merge to sh_topk, block reduce and get max value;
* 3. go to the second step, until one thread's topk value is null;
* 4. go to the first step, until the topk value is obtained.
*/
template <typename T, int MaxLength, int BlockSize>
__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
const T* src, int lds, int dim, int k,
int grid_dim, int num) {
__shared__ Pair<T> sh_topk[BlockSize];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
const int bid = blockIdx.x;
for (int i = bid; i < num; i += grid_dim) {
int top_num = k;
__shared__ int maxid[BlockSize / 2];
T* out = output + i * output_stride;
int64_t* inds = indices + i * k;
Pair<T> topk[MaxLength];
int beam = MaxLength;
Pair<T> max;
bool is_empty = false;
bool firststep = true;
for (int j = 0; j < MaxLength; j++) {
topk[j].set(-static_cast<T>(INFINITY), -1);
}
while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>(
topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
&beam, &top_num, tid, warp);
}
}
}
template <typename T, int MaxLength, int BlockSize>
__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad,
size_t rows, size_t cols, size_t k) {
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
x_grad[i * cols + j] = 0;
}
for (size_t j = 0; j < k; ++j) {
size_t idx = indices[i * k + j];
x_grad[i * cols + idx] = out_grad[i * k + j];
}
}
}
inline static int GetDesiredBlockDim(int dim) {
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
// Iterator to move to the next row
struct SegmentOffsetIter {
EIGEN_DEVICE_FUNC
explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
return idx * num_cols_;
}
int num_cols_;
};
// Iterator mapping a flat element index into its column index
struct ColumnIndexIter {
explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
const Eigen::array<int, 1>& ix) const {
return ix[0] % num_cols_;
}
int num_cols_;
};
__global__ void InitIndex(int64_t* indices, int64_t num_rows,
int64_t num_cols) {
int col_id = threadIdx.x;
int row_id = blockIdx.x;
for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
indices[j * num_cols + i] = i;
}
}
}
template <typename T>
bool SortTopk(const platform::CUDADeviceContext& ctx,
const framework::Tensor* input_tensor, const int64_t num_cols,
const int64_t num_rows, const int k,
framework::Tensor* out_tensor,
framework::Tensor* indices_tensor) {
auto cu_stream = ctx.stream();
Tensor input_indices;
const std::vector<int64_t> dims = {num_rows, num_cols};
auto dim = framework::make_ddim(dims);
input_indices.Resize(dim);
// input_indices.Resize(num_rows*num_cols);
input_indices.mutable_data<int64_t>(ctx.GetPlace());
size_t temp_storage_bytes = -1;
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(num_cols);
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
// in practice, num_rows < max_grid_size
unsigned int grid_size = num_rows < maxGridDimX
? static_cast<unsigned int>(num_rows)
: maxGridDimX;
// Init an index array
InitIndex<<<grid_size, block_size, 0, cu_stream>>>(
input_indices.data<int64_t>(), num_rows, num_cols);
// create iter for counting input
cub::CountingInputIterator<int64_t> counting_iter(0);
// segment_offset is used to move to the next row
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
cub::CountingInputIterator<int64_t>>
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
T* sorted_values_ptr;
int64_t* sorted_indices_ptr;
Tensor temp_values;
Tensor temp_indices;
const T* input = input_tensor->data<T>();
T* values = out_tensor->data<T>();
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
if (k == num_cols) {
// Doing a full sort.
sorted_values_ptr = values;
sorted_indices_ptr = indices;
} else {
temp_values.Resize(dim);
temp_indices.Resize(dim);
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
}
// Get temp storage buffer size; a fixed buffer could be allocated to save time.
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
nullptr, temp_storage_bytes, input, sorted_values_ptr,
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
"temp_storage_bytes, status: "
<< cudaGetErrorString(err);
return false;
}
Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
0, sizeof(T) * 8, cu_stream);
if (err != cudaSuccess) {
LOG(ERROR)
<< "TopKOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes: "
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
return false;
}
auto& dev = *ctx.eigen_device();
if (k < num_cols) {
// copy sliced data to output.
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
auto dim = framework::make_ddim(odims);
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
auto e_tmp_values = EigenMatrix<T>::From(temp_values);
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
}
return true;
}
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
......@@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
const int64_t input_width = inputdims[inputdims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
indices)) {
......@@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1));
const size_t col = xdims[xdims.size() - 1];
const auto& dev_ctx = context.cuda_device_context();
const int kMaxHeight = 2048;
int gridx = row < kMaxHeight ? row : kMaxHeight;
switch (GetDesiredBlockDim(col)) {
......@@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k,
paddle::operators::TopkOpCUDAKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/top_k_v2_op.h"
#include <memory>
namespace paddle {
namespace operators {
class TopkV2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Indices"),
"Output(Indices) of TopkOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
const int& dim_size = input_dims.size();
const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
"the axis of topk"
"must be [-%d, %d), but you set axis is %d",
dim_size, dim_size, axis);
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
PADDLE_ENFORCE_GE(input_dims.size(), 1,
"input of topk must have >= 1d shape");
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
input_dims[axis], k,
"input of topk op must have >= %d columns in axis of %d", k, axis);
}
framework::DDim dims = input_dims;
dims[axis] = k;
ctx->SetOutputDim("Out", dims);
ctx->SetOutputDim("Indices", dims);
ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices");
}
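// Shape example (illustrative): for X with dims [2, 3, 4], axis = 1 and
// k = 2, both Out and Indices are inferred as [2, 2, 4].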
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(),
layout_, library_);
}
};
class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input of Topk op");
AddInput("K",
"(Tensor) Number of top elements to look for along "
"the last dimension (along each row for matrices).")
.AsDispensable();
AddOutput("Out", "(Tensor) The output tensor of Topk op");
AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
AddComment(R"DOC(
Top K operator
If the input is a vector (1d tensor), this operator finds the k largest
entries in the vector and outputs their values and indices as vectors.
Thus values[j] is the j-th largest entry in input, and its index is indices[j].
For matrices, this operator computes the top k entries in each row. )DOC");
AddAttr<int>("k",
"(int, default 1) Number of top elements to look for along "
"the tensor).")
.SetDefault(1);
AddAttr<int>("axis",
"the axis to sort and get the k indices, value."
"if not set, will get k value in last axis.")
.SetDefault(-1);
AddAttr<bool>("largest",
"control flag whether to return largest or smallest")
.SetDefault(true);
AddAttr<bool>("sorted",
"control flag whether to return elements in sorted order")
.SetDefault(true);
}
};
class TopkV2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::InvalidArgument("Input(X) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Indices"), true,
platform::errors::InvalidArgument("Input(Indices) should be not null"));
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
platform::errors::InvalidArgument(
"Grad Input(Out) should be not null"));
PADDLE_ENFORCE_EQ(
ctx->HasOutput(framework::GradVarName("X")), true,
platform::errors::InvalidArgument("Grad Output(X) should be not null"));
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
template <typename T>
class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("top_k_v2_grad");
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput("X", this->Input("X"));
op->SetInput("Indices", this->Output("Indices"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
REGISTER_OP_CPU_KERNEL(top_k_v2,
ops::TopkV2Kernel<paddle::platform::CPUPlace, float>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, double>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, int32_t>,
ops::TopkV2Kernel<paddle::platform::CPUPlace, int64_t>)
REGISTER_OP_CPU_KERNEL(
top_k_v2_grad, ops::TopkV2GradKernel<paddle::platform::CPUPlace, float>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, double>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, int32_t>,
ops::TopkV2GradKernel<paddle::platform::CPUPlace, int64_t>)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/p_norm_op.h"
#include "paddle/fluid/operators/top_k_function_cuda.h"
#include "paddle/fluid/operators/top_k_v2_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
template <typename DeviceContext, typename T>
class TopkV2OpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
// get the attributes
int k = static_cast<int>(ctx.Attr<int>("k"));
int axis = static_cast<int>(ctx.Attr<int>("axis"));
const bool& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
const bool& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
// get the input dims
const auto& in_dims = input->dims();
// calculate the real axis
if (axis < 0) axis += in_dims.size();
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
Tensor k_host;
framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
k = k_host.data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
const auto& out_dims = output->dims();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
if (axis == in_dims.size() - 1) {
// if getting the topK from the last axis
const int64_t& input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
if (k > input_width) k = input_width;
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
indices, largest)) {
// Succeeded, return.
return;
} else {
LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
"default topk kernel.";
}
}
// NOTE: pass lds and dim the same as the input width.
// NOTE: the old matrix implementation's stride differs from eigen's.
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (GetDesiredBlockDim(input_width)) {
FIXED_BLOCK_DIM(
KeMatrixTopK<T, 5,
kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
output_data, k, indices_data, input_data, input_width,
input_width, static_cast<int>(k), gridx, input_height,
largest));
default:
PADDLE_THROW(platform::errors::Fatal(
"the input data shape has error in the topk cuda kernel."));
}
} else {
// if getting topK not from the last axis, transpose the tensor and then
// get the topK
// first step, prepare the trans args for the transpose
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
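// e.g. for a 4-D input with axis = 1 this builds trans = {0, 3, 2, 1},
// swapping the topk axis with the last axis so the last-axis path applies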
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(output->dims());
for (int i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
trans_out_dims[i] = out_dims[trans[i]];
}
// second step, transpose the input
Tensor trans_input;
trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
int ndims = trans.size();
const auto& dev_ctx = ctx.cuda_device_context();
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
&trans_input, trans);
// third step, calculate the topk
// allocate the tmp cuda memory for the tmp result
Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
Tensor trans_out;
trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
const int64_t input_height = framework::product(
framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
if (k > input_width) k = input_width;
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
if (SortTopk<T>(dev_ctx, &trans_input, input_width, input_height, k,
&trans_out, &trans_ind, largest)) {
// last step, transpose back the indices and output
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(
ndims, dev_ctx, trans_out, output, trans);
return;
} else {
LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
"default topk kernel.";
}
}
const int kMaxHeight = 2048;
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
switch (GetDesiredBlockDim(input_width)) {
FIXED_BLOCK_DIM(
KeMatrixTopK<T, 5,
kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
trans_out.data<T>(), k, trans_ind.data<int64_t>(),
trans_input.data<T>(), input_width, input_width,
static_cast<int>(k), gridx, input_height, largest));
default:
PADDLE_THROW(platform::errors::Fatal(
"the input data shape has error in the topk cuda kernel."));
}
// last step, transpose back the indices and output
TransCompute<platform::CUDADeviceContext, int64_t>(
ndims, dev_ctx, trans_ind, indices, trans);
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
output, trans);
}
}
};
#undef FIXED_BLOCK_DIM_BASE
#undef FIXED_BLOCK_DIM
template <typename DeviceContext, typename T>
class TopkV2OpGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int axis = context.Attr<int>("axis");
const auto& in_dims = x->dims();
const auto& out_dims = indices->dims();
// get the real axis and the k
if (axis < 0) axis += in_dims.size();
const int& k = out_dims[axis];
const int& raw_height = in_dims[axis];
// allocate the cuda memory for the x_grad
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const int64_t* indices_data = indices->data<int64_t>();
int pre, n, post;
GetDims(in_dims, axis, &pre, &n, &post);
// calculate the block and grid num
auto& dev_ctx = context.cuda_device_context();
auto ComputeBlockSize = [](int col) {
if (col > 512)
return 1024;
else if (col > 256 && col <= 512)
return 512;
else if (col > 128 && col <= 256)
return 256;
else if (col > 64 && col <= 128)
return 128;
else
return 64;
};
int block_size = ComputeBlockSize(post * k);
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
int grid_size = std::min(max_blocks, pre);
// launch the cuda kernel to assign the grad
AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
out_grad_data, indices_data, x_grad_data, pre, post, n, k);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k_v2,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::TopkV2OpCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, float>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, double>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, int>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, int64_t>,
paddle::operators::TopkV2OpGradCUDAKernel<
paddle::platform::CUDADeviceContext, paddle::platform::float16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
The reason we need topk v2 is compatibility: it redefines NaN as the maximum
value during comparison. Without topk v2, this would change the inference
results of models trained with older versions of PaddlePaddle.
*/
#pragma once
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
template <typename T, typename Type>
static void FullTopK(Type input_height, Type input_width, int input_dim,
const framework::Tensor* input, T* t_out, Type* t_indices,
const int& k, const bool& largest, const bool& sorted) {
// when k is small, use partial sort
bool partial_sort_flag = (k * 64) < input_width;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
// Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
for (Type i = 0; i < input_height; ++i) {
std::vector<std::pair<T, Type>> col_vec;
col_vec.reserve(input_width);
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
for (Type j = 0; j < input_width; ++j) {
col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
}
}
if (partial_sort_flag) {
std::partial_sort(
col_vec.begin(), col_vec.begin() + k, col_vec.end(),
[&largest](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
if (largest) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
} else {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
}
});
} else {
// use nth_element to get the k largest or k smallest elements
if (largest) {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
});
// nth_element leaves the first k elements unordered; sort them if required
if (sorted) {
std::sort(col_vec.begin(), col_vec.begin() + k - 1,
[&largest](const std::pair<T, Type>& l,
const std::pair<T, Type>& r) {
return (std::isnan(static_cast<double>(l.first)) &&
!std::isnan(static_cast<double>(r.first))) ||
(l.first > r.first);
});
}
} else {
std::nth_element(
col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
// nth_element leaves the first k elements unordered; sort them if required
if (sorted) {
std::sort(
col_vec.begin(), col_vec.begin() + k - 1,
[](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
return (!std::isnan(static_cast<double>(l.first)) &&
std::isnan(static_cast<double>(r.first))) ||
(l.first < r.first);
});
}
}
}
for (Type j = 0; j < k; ++j) {
t_out[i * k + j] = col_vec[j].first;
t_indices[i * k + j] = col_vec[j].second;
}
}
}
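For reference, a hedged Python rendering of FullTopK's per-row behavior (a sketch, not the committed kernel): pair each value with its column index, order with the NaN-as-max rule, and keep the first k pairs. A full sort stands in for the partial_sort / nth_element fast paths.

```python
import math
from functools import cmp_to_key

def full_topk_row(row, k, largest=True):
    def cmp(l, r):
        if largest:
            gt = (math.isnan(l[0]) and not math.isnan(r[0])) or (l[0] > r[0])
        else:
            gt = (not math.isnan(l[0]) and math.isnan(r[0])) or (l[0] < r[0])
        return -1 if gt else 1

    # Sort (value, index) pairs, then keep the first k of each.
    pairs = sorted(((v, j) for j, v in enumerate(row)), key=cmp_to_key(cmp))
    return [p[0] for p in pairs[:k]], [p[1] for p in pairs[:k]]

print(full_topk_row([2.0, 9.0, 5.0, 7.0], k=2))  # ([9.0, 7.0], [1, 3])
```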
template <typename T, typename Type>
static void FullTopKAssign(const Type& input_height, const Type& input_width,
const int& input_dim, const framework::Tensor* input,
const framework::Tensor* indices, T* output_data,
const int& k) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (Type i = 0; i < input_height; ++i) {
if (input_dim == 1) {
auto e_input = EigenVector<T>::Flatten(*input);
auto e_indices = EigenVector<Type>::Flatten(*indices);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(j)] = e_input(j);
}
} else {
auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
for (Type j = 0; j < k; ++j) {
output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
}
}
}
}
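The gradient scatter above, restated as a minimal Python sketch (assuming a single 1-D row for brevity): each of the k output gradients is written to the column its stored top-k index points at, and untouched columns keep their zero initialization.

```python
def topk_grad_row(out_grad, indices, width):
    x_grad = [0.0] * width   # zero init: unselected elements get no grad
    for g, idx in zip(out_grad, indices):
        x_grad[idx] = g      # scatter along the stored indices
    return x_grad

print(topk_grad_row([0.1, 0.2], [2, 0], width=4))  # [0.2, 0.0, 0.1, 0.0]
```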
template <typename DeviceContext, typename T>
class TopkV2Kernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// Get the top k elements of each row of input tensor
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
auto* indices = context.Output<Tensor>("Indices");
const auto& in_dims = input->dims();
int k = static_cast<int>(context.Attr<int>("k"));
const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
// if axis < 0, calculate the real axis
int axis = static_cast<int>(context.Attr<int>("axis"));
if (axis < 0) axis += in_dims.size();
// if the K tensor is not null, use it as k
auto* k_t = context.Input<Tensor>("K");
if (k_t) {
k = k_t->data<int>()[0];
framework::DDim output_dims = output->dims();
// set the k value for the given axis in the output dims
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
T* output_data = output->mutable_data<T>(context.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
const auto& out_dims = output->dims();
if (axis + 1 == in_dims.size()) {
const int64_t& input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t& input_width = in_dims[in_dims.size() - 1];
FullTopK<T, int64_t>(input_height, input_width, in_dims.size(), input,
output_data, indices_data, k, largest, sorted);
} else {
// if the top-k axis is not the last dim, transpose first, then do top-k
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.push_back(in_dims.size() - 1);
for (int i = axis + 1; i < in_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
// get the trans input_dims, out_dims
framework::DDim trans_dims(in_dims);
framework::DDim trans_out_dims(output->dims());
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = in_dims[trans[i]];
}
for (size_t i = 0; i < trans.size(); i++) {
trans_out_dims[i] = out_dims[trans[i]];
}
Tensor trans_inp;
trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
// transpose the input value
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
&trans_inp, trans);
const int64_t input_height = framework::product(
framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
const int64_t input_width = trans_dims[trans_dims.size() - 1];
// allocate temp tensors to save the top-k values and indices
Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
Tensor tmp_indices;
auto* t_ind =
tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
// get the TopK value
FullTopK<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_inp, t_out, t_ind, k, largest, sorted);
// transpose back
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, tmp_indices, indices, trans);
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
output, trans);
}
}
};
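The non-last-axis path can be summarized with numpy (an illustrative sketch, assuming largest=True and sorted=True, and ignoring the NaN-as-max rule for brevity): move the reduced axis to the end, take top-k on the last dim, then move values and indices back.

```python
import numpy as np

def topk_along_axis(x, k, axis):
    x_t = np.moveaxis(x, axis, -1)             # transpose axis to the end
    idx = np.argsort(-x_t, axis=-1)[..., :k]   # top-k on the last dim
    val = np.take_along_axis(x_t, idx, axis=-1)
    return np.moveaxis(val, -1, axis), np.moveaxis(idx, -1, axis)

x = np.arange(24).reshape(2, 3, 4)
vals, inds = topk_along_axis(x, k=2, axis=1)
print(vals.shape, inds.shape)  # (2, 2, 4) (2, 2, 4)
```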
template <typename DeviceContext, typename T>
class TopkV2GradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* indices = context.Input<Tensor>("Indices");
auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
int axis = static_cast<int>(context.Attr<int>("axis"));
const auto& in_dims = x->dims();
const auto& out_dims = indices->dims();
// if axis < 0, get the real axis
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
const size_t& k = out_dims[axis];
T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
if (axis + 1 == in_dims.size()) {
// allocate the memory for the input_grad
// assign the out_grad to input_grad directly
const int64_t input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
// init the output grad with 0, because some input elements have no grad
memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
// Assign the output_grad to input_grad
FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
indices, x_grad_data, k);
} else {
// cannot assign the grad to input_grad directly; transpose first
std::vector<int> trans;
for (int i = 0; i < axis; i++) {
trans.emplace_back(i);
}
trans.emplace_back(out_dims.size() - 1);
for (int i = axis + 1; i < out_dims.size() - 1; i++) {
trans.emplace_back(i);
}
trans.emplace_back(axis);
framework::DDim trans_dims(out_dims);
framework::DDim trans_in_dims(in_dims);
for (size_t i = 0; i < trans.size(); i++) {
trans_dims[i] = out_dims[trans[i]];
trans_in_dims[i] = in_dims[trans[i]];
}
// transpose the out_grad, indices
Tensor trans_dO;
trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
int ndims = trans.size();
auto& dev_context =
context.template device_context<platform::CPUDeviceContext>();
// Do transpose
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *out_grad,
&trans_dO, trans);
TransCompute<platform::CPUDeviceContext, int64_t>(
ndims, dev_context, *indices, &trans_ind, trans);
const int64_t input_height = framework::product(
framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
// assign the out_grad to the transposed input_grad
Tensor tmp_out;
T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
memset(t_out, 0, x_grad->numel() * sizeof(T));
FullTopKAssign<T, int64_t>(input_height, input_width, in_dims.size(),
&trans_dO, &trans_ind, t_out, k);
// Transpose back
TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
x_grad, trans);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -24,17 +24,63 @@ class UniqueOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique");
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
auto in_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
in_dims.size(), 1,
platform::errors::InvalidArgument("The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
if (!ctx->Attrs().Get<bool>("is_sorted")) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
PADDLE_ENFORCE_EQ(in_dims.size(), 1,
platform::errors::InvalidArgument(
"The Input(X) should be 1-D Tensor, "
"But now the dims of Input(X) is %d.",
in_dims.size()));
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
return;
}
bool return_index = ctx->Attrs().Get<bool>("return_index");
bool return_inverse = ctx->Attrs().Get<bool>("return_inverse");
bool return_counts = ctx->Attrs().Get<bool>("return_counts");
auto axis_vec = ctx->Attrs().Get<std::vector<int>>("axis");
if (return_index) {
OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique");
}
if (return_inverse) {
OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique");
}
if (return_counts) {
OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique");
}
ctx->SetOutputDim("Out", {-1});
ctx->SetOutputDim("Index", in_dims);
if (axis_vec.empty()) {
ctx->SetOutputDim("Out", {-1});
if (return_inverse) {
ctx->SetOutputDim("Index", {framework::product(in_dims)});
}
} else {
int axis = axis_vec[0];
if (axis < 0) {
axis += in_dims.size();
}
PADDLE_ENFORCE_LT(
axis, in_dims.size(),
platform::errors::InvalidArgument("The axis(%d) should be less than "
"the dimension size(%d) of x.",
axis, in_dims.size()));
auto out_dims = in_dims;
out_dims[axis] = -1;
ctx->SetOutputDim("Out", out_dims);
if (return_inverse) {
ctx->SetOutputDim("Index", {in_dims[axis]});
}
}
if (return_index) {
ctx->SetOutputDim("Indices", {-1});
}
if (return_counts) {
ctx->SetOutputDim("Counts", {-1});
}
}
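The shapes set above mirror numpy.unique: only the reduced axis becomes dynamic (-1), and the inverse Index has length in_dims[axis]. A quick check:

```python
import numpy as np

x = np.array([[1, 1, 2],
              [3, 3, 4]])
out, inverse = np.unique(x, axis=1, return_inverse=True)
print(out.shape, inverse.shape)  # (2, 2) (3,)
```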
protected:
......@@ -49,14 +95,47 @@ class UniqueOp : public framework::OperatorWithKernel {
class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input tensor. It should be a 1-D tensor.");
AddInput("X",
"Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
" is fasle or a N-D tensor when Attr(is_sorted) is true.");
AddAttr<int>("dtype", "data type for output index");
AddOutput("Out", "A unique subsequence for input tensor.");
AddOutput("Index",
"An index tensor pointing to unique subsequence, which has "
"identical shape with input tensor and int64 dtype.");
"Equivalent to inverse in numpy.unique, "
"the indices for where elements in the original input ended up "
"in the returned unique tensor.");
AddOutput(
"Indices",
"The indices of the input tensor that result in the unique tensor.")
.AsDispensable();
AddOutput("Counts", "The counts for each unique element.").AsDispensable();
AddAttr<bool>("return_index",
"If True, also return the indices of the input"
" tensor that result in the unique Tensor.")
.SetDefault(false);
AddAttr<bool>(
"return_inverse",
"If True, also return the indices for where elements"
" in the original input ended up in the returned unique tensor.")
.SetDefault(false);
AddAttr<bool>("return_counts",
"If True, also return the counts for each unique element.")
.SetDefault(false);
AddAttr<std::vector<int>>(
"axis",
"The axis to apply unique. If None, the input will be flattened.")
.SetDefault({});
AddAttr<bool>("is_sorted",
"If True, the unique elements of X are in ascending order."
"Otherwise, the unique elements are not sorted.")
.SetDefault(false);
AddComment(R"DOC(
Return a unique subsequence for 1-D input tensor, and an index tensor pointing to this unique subsequence
1. Returns a unique subsequence of the 1-D input tensor, and an index tensor
pointing into this unique subsequence, when Attr(is_sorted) is false. This
is the path taken by fluid.layers.unique.
2. Returns the unique elements of X in ascending order when Attr(is_sorted)
is true. This is the path taken by paddle.unique.
)DOC");
}
};
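A minimal usage sketch of the sorted mode described in point 2, assuming the 2.0-style dygraph API exercised by the unit tests added later in this commit:

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype='int64'))
out, counts = paddle.unique(x, return_counts=True)  # is_sorted=True path
print(out.numpy())     # [1 2 3 5]
print(counts.numpy())  # [1 1 3 1]
```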
......@@ -65,6 +144,8 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);
REGISTER_OP_CPU_KERNEL(unique, ops::UniqueKernel<float>,
ops::UniqueKernel<double>, ops::UniqueKernel<int32_t>,
ops::UniqueKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(
unique, ops::UniqueKernel<paddle::platform::CPUDeviceContext, float>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, double>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::UniqueKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/transpose_op.h"
namespace paddle {
namespace operators {
......@@ -104,17 +109,243 @@ struct UniqueOpFunctor {
}
};
static std::vector<framework::Tensor> Unbind(const framework::Tensor& in) {
int64_t size = in.dims()[0];
std::vector<framework::Tensor> tensors(size);
for (int64_t i = 0; i < size; ++i) {
tensors[i] = in.Slice(i, i + 1);
}
return tensors;
}
template <typename T>
static bool Equal(const framework::Tensor& a, const framework::Tensor& b) {
if (a.numel() != b.numel()) {
return false;
}
for (int64_t i = 0; i < a.numel(); ++i) {
if (a.data<T>()[i] != b.data<T>()[i]) {
return false;
}
}
return true;
}
template <typename T>
static void UniqueFlattendTensor(const framework::ExecutionContext& context,
const framework::Tensor& in,
framework::Tensor* out, bool return_index,
bool return_inverse, bool return_counts) {
const T* in_data = in.data<T>();
std::set<T> unique(in_data, in_data + in.numel());
out->Resize(framework::make_ddim({static_cast<int64_t>(unique.size())}));
auto out_data = out->mutable_data<T>(context.GetPlace());
std::copy(unique.begin(), unique.end(), out_data);
if (return_index) {
auto* indices = context.Output<framework::Tensor>("Indices");
indices->Resize(framework::make_ddim({out->numel()}));
auto indices_data = indices->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> indices_map;
indices_map.reserve(out->numel());
for (int64_t i = 0; i < in.numel(); ++i) {
if (indices_map.find(in_data[i]) != indices_map.end()) continue;
indices_map[in_data[i]] = i;
}
for (int64_t i = 0; i < out->numel(); ++i) {
indices_data[i] = indices_map[out_data[i]];
}
}
if (return_inverse) {
auto* inverse = context.Output<framework::Tensor>("Index");
inverse->Resize(framework::make_ddim({in.numel()}));
auto inverse_data = inverse->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> inverse_map;
inverse_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
inverse_map[out_data[i]] = i;
}
for (int64_t i = 0; i < in.numel(); ++i) {
inverse_data[i] = inverse_map[in_data[i]];
}
}
if (return_counts) {
auto* count = context.Output<framework::Tensor>("Counts");
count->Resize(framework::make_ddim({out->numel()}));
auto count_data = count->mutable_data<int64_t>(context.GetPlace());
std::unordered_map<T, int64_t> counts_map;
counts_map.reserve(out->numel());
for (int64_t i = 0; i < out->numel(); ++i) {
counts_map[out_data[i]] = 0;
}
for (int64_t i = 0; i < in.numel(); i++) {
counts_map[in_data[i]] += 1;
}
for (int64_t i = 0; i < out->numel(); i++) {
count_data[i] = counts_map[out_data[i]];
}
}
}
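A pure-Python restatement of the flattened path above (a sketch, not the committed code): a sorted set gives Out, first-occurrence positions give Indices, a value-to-slot map gives Index (the inverse), and a counter gives Counts.

```python
def unique_flattened(values):
    out = sorted(set(values))                 # Out: sorted unique values
    slot = {v: i for i, v in enumerate(out)}  # value -> position in Out
    first = {}
    for i, v in enumerate(values):
        first.setdefault(v, i)                # first occurrence in the input
    indices = [first[v] for v in out]         # Indices
    inverse = [slot[v] for v in values]       # Index (inverse mapping)
    counts = [0] * len(out)
    for v in values:
        counts[slot[v]] += 1                  # Counts
    return out, indices, inverse, counts

print(unique_flattened([2, 3, 3, 1, 5, 3]))
# ([1, 2, 3, 5], [3, 0, 1, 4], [1, 2, 2, 0, 3, 2], [1, 1, 3, 1])
```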
template <class ForwardIt, typename T>
static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context,
ForwardIt first, ForwardIt last,
const std::vector<int64_t>& sorted_indices_vec,
std::vector<int64_t>* inverse_vec,
std::vector<int64_t>* counts_vec,
std::vector<int64_t>* indices_vec) {
if (first == last) {
return last;
}
(*inverse_vec)[sorted_indices_vec[0]] = 0;
(*counts_vec)[0] = 1;
(*indices_vec)[0] = sorted_indices_vec[0];
ForwardIt begin = first;
ForwardIt result = first;
while (++first != last) {
int64_t idx_first = std::distance(begin, first);
int64_t idx_result = std::distance(begin, result);
if (!Equal<T>(*result, *first)) {
if (++result != first) {
*result = std::move(*first);
}
idx_result += 1;
(*indices_vec)[idx_result] = sorted_indices_vec[idx_first];
}
(*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result;
(*counts_vec)[idx_result] += 1;
}
return ++result;
}
template <typename DeviceContext, typename T>
static void UniqueDim(const framework::ExecutionContext& context,
const framework::Tensor& in, framework::Tensor* out,
bool return_index, bool return_inverse,
bool return_counts, int axis) {
// transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2]
std::vector<int> permute(in.dims().size());
std::iota(permute.begin(), permute.end(), 0);
permute[axis] = 0;
permute[0] = axis;
std::vector<int64_t> in_trans_dims_vec(framework::vectorize(in.dims()));
in_trans_dims_vec[axis] = in.dims()[0];
in_trans_dims_vec[0] = in.dims()[axis];
framework::Tensor in_trans;
framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec);
in_trans.Resize(in_trans_dims);
in_trans.mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
TransCompute<DeviceContext, T>(in.dims().size(), dev_ctx, in, &in_trans,
permute);
// reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2]
framework::DDim in_trans_flat_dims =
framework::flatten_to_2d(in_trans_dims, 1);
in_trans.Resize(in_trans_flat_dims);
// sort indices
std::vector<int64_t> sorted_indices_vec(in_trans.dims()[0]);
std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0);
int64_t col = in_trans.dims()[1];
const T* in_trans_data = in_trans.data<T>();
std::sort(sorted_indices_vec.begin(), sorted_indices_vec.end(),
[&](int64_t a, int64_t b) -> bool {
for (int64_t i = 0; i < col; ++i) {
T lhs = in_trans_data[i + a * col];
T rhs = in_trans_data[i + b * col];
if (lhs < rhs) {
return true;
} else if (lhs > rhs) {
return false;
}
}
return false;
});
// sort tensor according to indices
framework::Tensor input_sorted;
input_sorted.Resize(in_trans_dims);
input_sorted.mutable_data<T>(context.GetPlace());
T* input_sorted_data = input_sorted.data<T>();
for (size_t i = 0; i < sorted_indices_vec.size(); ++i) {
memcpy(input_sorted_data + i * col,
in_trans_data + sorted_indices_vec[i] * col, col * sizeof(T));
}
std::vector<framework::Tensor> input_unbind = Unbind(input_sorted);
std::vector<int64_t> inverse_vec(sorted_indices_vec.size(), 0);
std::vector<int64_t> counts_vec(sorted_indices_vec.size(), 0);
std::vector<int64_t> indices_vec(sorted_indices_vec.size(), 0);
auto last = UniqueDimImpl<std::vector<framework::Tensor>::iterator, T>(
context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec,
&inverse_vec, &counts_vec, &indices_vec);
input_unbind.erase(last, input_unbind.end());
counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end());
indices_vec.erase(indices_vec.begin() + input_unbind.size(),
indices_vec.end());
math::ConcatFunctor<DeviceContext, T> concat_functor;
framework::Tensor out_trans;
std::vector<int64_t> out_trans_dims_vec = in_trans_dims_vec;
out_trans_dims_vec[0] = input_unbind.size();
out_trans.Resize(framework::make_ddim(out_trans_dims_vec));
out_trans.mutable_data<T>(context.GetPlace());
std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
out->Resize(framework::make_ddim(out_trans_dims_vec));
out->mutable_data<T>(context.GetPlace());
concat_functor(dev_ctx, input_unbind, 0, &out_trans);
TransCompute<DeviceContext, T>(out_trans.dims().size(), dev_ctx, out_trans,
out, permute);
if (return_inverse) {
auto* inverse = context.Output<framework::Tensor>("Index");
framework::TensorFromVector(inverse_vec, context.device_context(), inverse);
}
if (return_counts) {
auto* count = context.Output<framework::Tensor>("Counts");
framework::TensorFromVector(counts_vec, context.device_context(), count);
}
if (return_index) {
auto* indices = context.Output<framework::Tensor>("Indices");
framework::TensorFromVector(indices_vec, context.device_context(), indices);
}
}
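The same strategy in numpy terms (illustrative only): move `axis` to the front, flatten the remaining dims, sort the rows lexicographically, then collapse equal neighbours — exactly the transpose/reshape/sort/dedup sequence UniqueDim performs.

```python
import numpy as np

def unique_dim(x, axis):
    moved = np.moveaxis(x, axis, 0)
    flat = moved.reshape(moved.shape[0], -1)
    order = np.lexsort(flat.T[::-1])                # lexicographic row sort
    srt = flat[order]
    keep = np.ones(len(srt), dtype=bool)
    keep[1:] = np.any(srt[1:] != srt[:-1], axis=1)  # drop equal neighbours
    return np.moveaxis(moved[order[keep]], 0, axis)

x = np.array([[1, 2], [3, 4], [1, 2]])
print(unique_dim(x, axis=0))  # [[1 2] [3 4]]
```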
template <typename DeviceContext, typename T>
class UniqueKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* index = context.Output<framework::Tensor>("Index");
if (!context.Attr<bool>("is_sorted")) {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* index = context.Output<framework::Tensor>("Index");
framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
return;
}
framework::VisitDataType(data_type, UniqueOpFunctor<T>(out, index, x));
std::vector<int> axis_vec = context.Attr<std::vector<int>>("axis");
bool return_index = context.Attr<bool>("return_index");
bool return_inverse = context.Attr<bool>("return_inverse");
bool return_counts = context.Attr<bool>("return_counts");
if (axis_vec.empty()) {
UniqueFlattendTensor<T>(context, *x, out, return_index, return_inverse,
return_counts);
} else {
int axis = axis_vec[0];
UniqueDim<DeviceContext, T>(context, *x, out, return_index,
return_inverse, return_counts, axis);
}
}
};
......
......@@ -62,6 +62,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"sync_batch_norm",
{"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
"ReserveSpace"}},
{"unique", {"Out", "Index", "Indices", "Counts"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -564,9 +565,9 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
if (!is_gpu_tensor && !is_xpu_tensor) {
if (!need_deep_copy) {
return py::array(py::buffer_info(
const_cast<void *>(tensor_buf_ptr), sizeof_dtype, py_dtype_str,
static_cast<size_t>(tensor.dims().size()), py_dims, py_strides));
auto base = py::cast(std::move(tensor));
return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
const_cast<void *>(tensor_buf_ptr), base);
} else {
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(
......
......@@ -29,6 +29,8 @@ function(train_test TARGET_NAME)
PROPERTIES DEPENDS test_${TARGET_NAME})
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES LABELS "RUN_TYPE=DIST")
set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES TIMEOUT 150)
endforeach()
endfunction(train_test)
......
......@@ -70,7 +70,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
| `RUN_TEST` | OFF | Run unit test immediately after the build. |
| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
## Docker Images
......@@ -155,21 +154,6 @@ docker push
kubectl ...
```
### Reading source code with woboq codebrowser
For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
```bash
./paddle/scripts/paddle_docker_build.sh html
```
- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
```
docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
```
## More Options
......
......@@ -529,13 +529,16 @@ EOF
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
ut_startTime_s=`date +%s`
ctest --output-on-failure -j $2
ctest --output-on-failure -j $2;mactest_error=$?
ut_endTime_s=`date +%s`
echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
paddle version
# Restore the proxy to avoid failures in later steps
export http_proxy=$my_proxy
export https_proxy=$my_proxy
if [ "$mactest_error" != 0 ];then
exit 8;
fi
fi
}
......@@ -1105,22 +1108,6 @@ EOF
esac
}
function gen_html() {
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b ${PADDLE_ROOT}/build \
-a \
-o $WOBOQ_OUT \
-p paddle:${PADDLE_ROOT}
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
}
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
......@@ -1443,9 +1430,6 @@ function main() {
gen_doc_lib)
gen_doc_lib $2
;;
html)
gen_html
;;
dockerfile)
gen_dockerfile ${PYTHON_ABI:-""}
;;
......
......@@ -14,7 +14,7 @@
import paddle
from paddle.distributed.fleet.proto import distributed_strategy_pb2
from paddle.fluid.framework import Variable
from paddle.fluid.framework import Variable, set_flags, core
import google.protobuf.text_format
......@@ -810,6 +810,68 @@ class DistributedStrategy(object):
else:
print("WARNING: auto should have value of bool type")
@property
def cudnn_exhaustive_search(self):
return self.strategy.cudnn_exhaustive_search
@cudnn_exhaustive_search.setter
def cudnn_exhaustive_search(self, flag):
if isinstance(flag, bool):
self.strategy.cudnn_exhaustive_search = flag
else:
print(
"WARNING: cudnn_exhaustive_search should have value of bool type"
)
@property
def conv_workspace_size_limit(self):
return self.strategy.conv_workspace_size_limit
@conv_workspace_size_limit.setter
def conv_workspace_size_limit(self, value):
if isinstance(value, int):
self.strategy.conv_workspace_size_limit = value
else:
print(
"WARNING: conv_workspace_size_limit should have value of int type"
)
@property
def cudnn_batchnorm_spatial_persistent(self):
return self.strategy.cudnn_batchnorm_spatial_persistent
@cudnn_batchnorm_spatial_persistent.setter
def cudnn_batchnorm_spatial_persistent(self, flag):
if isinstance(flag, bool):
self.strategy.cudnn_batchnorm_spatial_persistent = flag
else:
print(
"WARNING: cudnn_batchnorm_spatial_persistent should have value of bool type"
)
def _enable_env(self):
strategy = self.strategy
keys = [
"FLAGS_cudnn_batchnorm_spatial_persistent",
"FLAGS_conv_workspace_size_limit",
"FLAGS_cudnn_exhaustive_search",
"FLAGS_sync_nccl_allreduce",
"FLAGS_fuse_parameter_memory_size",
"FLAGS_fuse_parameter_groups_size",
]
values = [
bool(strategy.cudnn_batchnorm_spatial_persistent),
int(strategy.conv_workspace_size_limit),
bool(strategy.cudnn_exhaustive_search),
bool(strategy.sync_nccl_allreduce),
int(strategy.fuse_grad_size_in_MB),
int(strategy.fuse_grad_size_in_TFLOPS),
]
for i, key in enumerate(keys):
if core.globals().is_public(key):
core.globals()[key] = values[i]
def __repr__(self):
fields = self.strategy.DESCRIPTOR.fields
for f in fields:
......
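Usage of the new knobs, mirroring the unit test added below; note that, per the setters above, a value of the wrong type is ignored with a warning rather than raised:

```python
import paddle

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
strategy.conv_workspace_size_limit = 1024          # int expected
strategy.cudnn_batchnorm_spatial_persistent = True
strategy.conv_workspace_size_limit = "400"         # wrong type: warned, keeps 1024
```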
......@@ -383,6 +383,7 @@ class Fleet(object):
context["valid_strategy"] = valid_strategy
self.valid_strategy = valid_strategy
self.valid_strategy._enable_env()
optimize_ops = []
params_grads = []
......
......@@ -271,6 +271,6 @@ endforeach()
# setting timeout value for old unittests
if(NOT WIN32)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 200)
set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250)
set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200)
endif()
......@@ -132,13 +132,16 @@ def _declarative_(dygraph_func):
"""
Converts imperative dygraph APIs into declarative function APIs. Decorator
@declarative handles the Program and Executor of static mode and returns
the result as a dygraph VarBase.
the result as dygraph Tensor(s). Users can use the returned dygraph
Tensor(s) for imperative training, inference, or other operations. If the
decorated function calls other imperative functions, they will be
converted into declarative functions as well.
Args:
dygraph_func (callable): callable imperative function.
Returns:
VarBase: containing the numerical result.
Tensor(s): containing the numerical result.
Examples:
.. code-block:: python
......@@ -147,6 +150,7 @@ def _declarative_(dygraph_func):
import numpy as np
from paddle.fluid.dygraph.jit import declarative
fluid.enable_dygraph()
@declarative
def func(x):
......
......@@ -110,7 +110,7 @@ def scope_guard(scope):
_switch_scope(ex)
def as_numpy(tensor):
def as_numpy(tensor, copy=False):
"""
Convert a Tensor to a numpy.ndarray. It only supports Tensors without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
......@@ -129,6 +129,7 @@ def as_numpy(tensor):
Args:
tensor(Variable): an instance of Tensor
copy(bool, optional): Whether to use deep copy.
Returns:
numpy.ndarray
......@@ -145,7 +146,10 @@ def as_numpy(tensor):
Please set the parameter 'return_numpy' as 'False' to \
return LoDTensor itself directly.")
if tensor._is_initialized():
return np.array(tensor)
if copy:
return np.array(tensor)
else:
return np.asarray(tensor)
else:
return None
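The copy flag leans on numpy's own semantics: np.array copies by default, while np.asarray reuses the underlying buffer when it can, which is what makes the zero-copy TensorToPyArray path above pay off. A small illustration:

```python
import numpy as np

buf = np.arange(4)
a = np.array(buf)    # deep copy: independent of buf
b = np.asarray(buf)  # no copy: shares memory with buf
buf[0] = 99
print(a[0], b[0])    # 0 99
```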
......@@ -350,7 +354,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
" program.")
tensor = var.get_tensor()
if return_numpy:
tensor = as_numpy(tensor)
tensor = as_numpy(tensor, copy=True)
return tensor
......
......@@ -522,9 +522,15 @@ if(NOT WIN32)
endif()
if(NOT APPLE AND NOT WIN32)
parallel_bash_test_modules(test_acp START_BASH parallel_test.sh TIMEOUT 140 UnitTests test_auto_checkpoint test_auto_checkpoint1 test_auto_checkpoint2 test_auto_checkpoint3)
parallel_bash_test_modules(test_acp2 START_BASH parallel_test.sh TIMEOUT 140 UnitTests test_auto_checkpoint_multiple test_auto_checkpoint_dist_basic)
parallel_bash_test_modules(test_hdfs START_BASH parallel_test.sh TIMEOUT 120 UnitTests test_hdfs1 test_hdfs2 test_hdfs3)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
endif()
add_subdirectory(sequence)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestBatchNorm(unittest.TestCase):
def test_name(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
with fluid.dygraph.guard(p):
batch_norm1d = paddle.nn.BatchNorm1d(1, name="test")
def test_error(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
#paddle.disable_static()
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
def error1d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
batch_norm1d = paddle.nn.BatchNorm1d(1)
batch_norm1d(fluid.dygraph.to_variable(x_data_4))
def error2d():
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
batch_norm2d = paddle.nn.BatchNorm2d(1)
batch_norm2d(fluid.dygraph.to_variable(x_data_3))
def error3d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
batch_norm3d = paddle.nn.BatchNorm3d(1)
batch_norm3d(fluid.dygraph.to_variable(x_data_4))
with fluid.dygraph.guard(p):
self.assertRaises(ValueError, error1d)
self.assertRaises(ValueError, error2d)
self.assertRaises(ValueError, error3d)
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x, is_test, trainable_statistics):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.BatchNorm(
shape[1],
is_test=is_test,
trainable_statistics=trainable_statistics)
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
bn = paddle.nn.BatchNorm2d(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x, False, False)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np, is_test, trainable_statistics):
with program_guard(Program(), Program()):
bn = fluid.dygraph.BatchNorm(
shape[1],
is_test=is_test,
trainable_statistics=trainable_statistics)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = bn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
bn = paddle.nn.BatchNorm2d(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = bn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x, False, False)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
......@@ -294,6 +294,28 @@ class TestStrategyConfig(unittest.TestCase):
with self.assertRaises(TypeError):
strategy.unknown_key = 'UNK'
def test_cudnn_exhaustive_search(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
self.assertEqual(strategy.cudnn_exhaustive_search, False)
strategy.cudnn_exhaustive_search = "True"
self.assertEqual(strategy.cudnn_exhaustive_search, False)
def test_cudnn_batchnorm_spatial_persistent(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = False
self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
strategy.cudnn_batchnorm_spatial_persistent = "True"
self.assertEqual(strategy.cudnn_batchnorm_spatial_persistent, False)
def test_conv_workspace_size_limit(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1000
self.assertEqual(strategy.conv_workspace_size_limit, 1000)
strategy.conv_workspace_size_limit = "400"
self.assertEqual(strategy.conv_workspace_size_limit, 1000)
strategy._enable_env()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestDygraphGroupNormv2(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [2, 6, 2, 2]
def compute_v1(x):
with fluid.dygraph.guard(p):
gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
y = gn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
y = gn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [2, 6, 2, 2]
def compute_v1(x_np):
with program_guard(Program(), Program()):
gn = fluid.dygraph.GroupNorm(channels=2, groups=2)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = gn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2)
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = gn(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestInstanceNorm(unittest.TestCase):
def test_error(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
def error1d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
instance_norm1d = paddle.nn.InstanceNorm1d(1)
instance_norm1d(fluid.dygraph.to_variable(x_data_4))
def error2d():
x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
instance_norm2d = paddle.nn.InstanceNorm2d(1)
instance_norm2d(fluid.dygraph.to_variable(x_data_3))
def error3d():
x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
instance_norm3d = paddle.nn.InstanceNorm3d(1)
instance_norm3d(fluid.dygraph.to_variable(x_data_4))
with fluid.dygraph.guard(p):
self.assertRaises(ValueError, error1d)
self.assertRaises(ValueError, error2d)
self.assertRaises(ValueError, error3d)
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x):
with fluid.dygraph.guard(p):
bn = fluid.dygraph.InstanceNorm(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
bn = paddle.nn.InstanceNorm2d(shape[1])
y = bn(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu(
"instance_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np):
with program_guard(Program(), Program()):
ins = fluid.dygraph.InstanceNorm(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ins(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
ins = paddle.nn.InstanceNorm2d(shape[1])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ins(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest, _set_use_system_allocator
from paddle.fluid.framework import grad_var_name
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
class TestDygraphLayerNormv2(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
def compute_v1(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[1:])
y = ln(fluid.dygraph.to_variable(x))
return y.numpy()
def compute_v2(x):
with fluid.dygraph.guard(p):
ln = paddle.nn.LayerNorm(shape[1:])
y = ln(fluid.dygraph.to_variable(x))
return y.numpy()
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
shape = [4, 10, 16, 16]
def compute_v1(x_np):
with program_guard(Program(), Program()):
ln = fluid.dygraph.LayerNorm(shape[1:])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ln(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
def compute_v2(x_np):
with program_guard(Program(), Program()):
ln = paddle.nn.LayerNorm(shape[1:])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ln(x)
exe.run(fluid.default_startup_program())
r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
return r
x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x)
y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2))
if __name__ == '__main__':
unittest.main()
......@@ -44,8 +44,8 @@ class TestMultiplyAPI(unittest.TestCase):
def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
paddle.disable_static()
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y, axis=axis)
return res.numpy()
......@@ -126,17 +126,31 @@ class TestMultiplyError(unittest.TestCase):
paddle.disable_static()
x_data = np.random.randn(200).astype(np.int8)
y_data = np.random.randn(200).astype(np.int8)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: inputs must be broadcastable
x_data = np.random.rand(200, 5)
y_data = np.random.rand(200)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: inputs must be broadcastable(python)
x_data = np.random.rand(200, 5)
y_data = np.random.rand(200)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y)
# test dynamic computation graph: dtype must be same
x_data = np.random.randn(200).astype(np.int64)
y_data = np.random.randn(200).astype(np.float64)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
self.assertRaises(TypeError, paddle.multiply, x, y)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid.core as core
def numpy_topk(x, k=1, axis=-1, largest=True):
if axis < 0:
axis = len(x.shape) + axis
if largest:
indices = np.argsort(-x, axis=axis)
else:
indices = np.argsort(x, axis=axis)
if largest:
value = -np.sort(-x, axis=axis)
else:
value = np.sort(x, axis=axis)
indices = indices.take(indices=range(0, k), axis=axis)
value = value.take(indices=range(0, k), axis=axis)
return value, indices
class TestTopkOp(OpTest):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 20)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
def test_check_output(self):
paddle.enable_static()
self.check_output()
def test_check_grad(self):
paddle.enable_static()
self.check_grad(set(['X']), 'Out')
class TestTopOp1(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 0
self.largest = True
class TestTopOp2(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 0
self.largest = False
class TestTopOp3(TestTopkOp):
def init_args(self):
self.k = 4
self.axis = 0
self.largest = False
class TestTopOp4(TestTopkOp):
def init_args(self):
self.k = 4
self.axis = 0
self.largest = False
class TestTopkOp5(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 10, 5)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
class TestTopkOp6(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
def setUp(self):
self.op_type = "top_k_v2"
self.dtype = np.float64
self.input_data = np.random.rand(10, 10, 5)
self.init_args()
self.inputs = {'X': self.input_data}
self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
output, indices = numpy_topk(
self.input_data, axis=self.axis, k=self.k, largest=self.largest)
self.outputs = {'Out': output, 'Indices': indices}
class TestTopKAPI(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.input_data = np.random.rand(6, 7, 8)
self.large_input_data = np.random.rand(2, 1030)
def run_dygraph(self, place):
paddle.disable_static(place)
input_tensor = paddle.to_tensor(self.input_data)
large_input_tensor = paddle.to_tensor(self.large_input_data)
# test case for basic test case 1
paddle_result = paddle.topk(input_tensor, k=2)
numpy_result = numpy_topk(self.input_data, k=2)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 2 with axis
paddle_result = paddle.topk(input_tensor, k=2, axis=1)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 3 with tensor K
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 4 with tensor largest
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False)
numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 5 with axis -1
k_tensor = paddle.to_tensor(np.array([2]))
paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 6 for the partial sort
paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
# test case for basic test case 7 for the unsorted
paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
sort_paddle = numpy_topk(
np.array(paddle_result[0].numpy()), axis=1, k=2)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
def run_static(self, place):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
input_tensor = paddle.static.data(
name="x", shape=[6, 7, 8], dtype="float64")
large_input_tensor = paddle.static.data(
name="large_x", shape=[2, 1030], dtype="float64")
k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
result1 = paddle.topk(input_tensor, k=2)
result2 = paddle.topk(input_tensor, k=2, axis=-1)
result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
exe = paddle.static.Executor(place)
input_data = np.random.rand(10, 20).astype("float64")
large_input_data = np.random.rand(2, 100).astype("float64")
paddle_result = exe.run(
feed={
"x": self.input_data,
"large_x": self.large_input_data,
"k": np.array([2]).astype("int32")
},
fetch_list=[
result1[0], result1[1], result2[0], result2[1], result3[0],
result3[1], result4[0], result4[1], result5[0], result5[1],
result6[0], result6[1], result7[0], result7[1]
])
numpy_result = numpy_topk(self.input_data, k=2)
self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
numpy_result = numpy_topk(self.input_data, k=2, axis=-1)
self.assertTrue(np.allclose(paddle_result[2], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[3], numpy_result[1]))
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
numpy_result = numpy_topk(
self.input_data, k=2, axis=1, largest=False)
self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
numpy_result = numpy_topk(
self.input_data, k=2, axis=-1, largest=False)
self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
self.assertTrue(np.allclose(paddle_result[10], numpy_result[0]))
self.assertTrue(np.allclose(paddle_result[11], numpy_result[1]))
sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2)
numpy_result = numpy_topk(self.input_data, k=2, axis=1)
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
def test_cases(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.run_dygraph(place)
self.run_static(place)
if __name__ == "__main__":
unittest.main()
......@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
......@@ -125,5 +126,164 @@ class TestRandomGPU(TestUniqueOp):
self.check_output_with_place(place, atol=1e-5)
class TestSortedUniqueOp(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')}
unique, indices, inverse, count = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=None)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": None,
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": count,
}
class TestUniqueOpAxisNone(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')}
unique, indices, inverse, counts = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=None)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": None,
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": counts,
}
class TestUniqueOpAxis1(TestUniqueOp):
def init_config(self):
self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')}
unique, indices, inverse, counts = np.unique(
self.inputs['X'],
return_index=True,
return_inverse=True,
return_counts=True,
axis=1)
self.attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": True,
"return_inverse": True,
"return_counts": True,
"axis": [1],
"is_sorted": True
}
self.outputs = {
'Out': unique,
'Indices': indices,
"Index": inverse,
"Counts": counts,
}
class TestUniqueAPI(unittest.TestCase):
def test_dygraph_api_out(self):
paddle.disable_static()
        x_data = np.random.randint(0, 10, (120, ))
x = paddle.to_tensor(x_data)
out = paddle.unique(x)
expected_out = np.unique(x_data)
        self.assertTrue((out.numpy() == expected_out).all())
paddle.enable_static()
def test_dygraph_api_attr(self):
paddle.disable_static()
x_data = np.random.random((3, 5, 5)).astype("float32")
x = paddle.to_tensor(x_data)
out, index, inverse, counts = paddle.unique(
x,
return_index=True,
return_inverse=True,
return_counts=True,
axis=0)
np_out, np_index, np_inverse, np_counts = np.unique(
x_data,
return_index=True,
return_inverse=True,
return_counts=True,
axis=0)
        self.assertTrue((out.numpy() == np_out).all())
        self.assertTrue((index.numpy() == np_index).all())
        self.assertTrue((inverse.numpy() == np_inverse).all())
        self.assertTrue((counts.numpy() == np_counts).all())
paddle.enable_static()
def test_static_graph(self):
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
x = paddle.data(name='x', shape=[3, 2], dtype='float64')
unique, inverse, counts = paddle.unique(
x, return_inverse=True, return_counts=True, axis=0)
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64')
result = exe.run(feed={"x": x_np},
fetch_list=[unique, inverse, counts])
np_unique, np_inverse, np_counts = np.unique(
x_np, return_inverse=True, return_counts=True, axis=0)
self.assertTrue(np.allclose(result[0], np_unique))
self.assertTrue(np.allclose(result[1], np_inverse))
self.assertTrue(np.allclose(result[2], np_counts))
class TestUniqueError(unittest.TestCase):
def test_input_dtype(self):
def test_x_dtype():
with paddle.static.program_guard(paddle.static.Program(),
paddle.static.Program()):
x = paddle.data(name='x', shape=[10, 10], dtype='float16')
result = paddle.unique(x)
self.assertRaises(TypeError, test_x_dtype)
def test_attr(self):
x = paddle.data(name='x', shape=[10, 10], dtype='float64')
def test_return_index():
result = paddle.unique(x, return_index=0)
self.assertRaises(TypeError, test_return_index)
def test_return_inverse():
result = paddle.unique(x, return_inverse='s')
self.assertRaises(TypeError, test_return_inverse)
def test_return_counts():
result = paddle.unique(x, return_counts=3)
self.assertRaises(TypeError, test_return_counts)
def test_axis():
result = paddle.unique(x, axis='12')
self.assertRaises(TypeError, test_axis)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -26,17 +26,22 @@ from collections import Iterable
import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place
# Note: Use alias `Input` temporarily before releasing hapi feature.
from paddle.static import InputSpec as Input
from paddle.fluid.executor import global_scope
from paddle.fluid.io import is_belong_to_optimizer
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
from paddle.fluid.layers.utils import flatten
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.executor import scope_guard, Executor
from paddle.io import DataLoader, Dataset
from paddle.fluid.dygraph.layers import Layer
from paddle.metric import Metric
from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
......@@ -846,24 +851,32 @@ class Model(object):
"""
return self._adapter.test_batch(inputs)
    def save(self, path, training=True):
        """
        This function saves parameters, optimizer information or a model and
        its parameters for inference only to the given path, depending on the
        argument `training`.
        If `training` is set to True, the saved parameters contain all the
        trainable Variables and are written to a file with the suffix
        ".pdparams".
        The optimizer information contains all the variables used by the
        optimizer. For the Adam optimizer, this includes beta1, beta2,
        momentum, etc. All the information is written to a file with the
        suffix ".pdopt". (If the optimizer has no variables that need to be
        saved, such as SGD, the file is not generated.)
        This function will silently overwrite existing files
        at the target location.
        If `training` is set to False, only the inference model is saved. Note
        that before using `save`, you should run the model once, and the saved
        input shape is the same as the input used in that run. In dynamic
        mode, `@paddle.jit.to_static` must currently be added to the `forward`
        function of your layer; this will be optimized later.
        Args:
            path (str): The file prefix to save the model. The format is
                'dirname/file_prefix' or 'file_prefix'. If it is an empty
                string, an exception will be raised.
            training (bool, optional): Whether to save for training. If not,
                save for inference only. Default: True.
Returns:
None
......@@ -871,25 +884,47 @@ class Model(object):
Examples:
.. code-block:: python
                import paddle
                import paddle.incubate.hapi as hapi
                from paddle.nn import Linear
                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
                class Mnist(paddle.nn.Layer):
                    def __init__(self):
                        super(Mnist, self).__init__()
                        self._fc = Linear(784, 1, act='softmax')
                    @paddle.jit.to_static  # if saving for inference in dygraph, this is required
                    def forward(self, x):
                        y = self._fc(x)
                        return y
dynamic = True # False
device = hapi.set_device('cpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = hapi.Input([None, 784], 'float32', 'x')
label = hapi.Input([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference
"""
if ParallelEnv().local_rank == 0:
if not training:
self._save_inference_model(path)
else:
self._adapter.save(path)
def load(self, path, skip_mismatch=False, reset_optimizer=False):
"""
......@@ -1474,13 +1509,17 @@ class Model(object):
cbks.on_end('test', logs)
return outputs
def _save_inference_model(self,
save_dir,
model_filename=None,
params_filename=None,
model_only=False):
"""
        Save an inference model, in either static or dynamic mode.
        Note that before using `save_inference_model`, you should run the
        model once, and the saved input shape is the same as the input used in
        that run. In dynamic mode, `@paddle.jit.to_static` must currently be
        added to the `forward` function of your layer; this will be optimized
        later.
Args:
save_dir (str): The directory path to save the inference model.
......@@ -1496,39 +1535,145 @@ class Model(object):
Returns:
list: The fetch variables' name list
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddle.static import InputSpec
import paddle.incubate.hapi as hapi
                from paddle.nn import Linear
                from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset
                class Mnist(paddle.nn.Layer):
def __init__(self, classifier_act=None):
super(Mnist, self).__init__()
self.fc = Linear(input_dim=784, output_dim=10, act="softmax")
@paddle.jit.to_static # In static mode, you need to delete this.
def forward(self, inputs):
outputs = self.fc(inputs)
return outputs
dynamic = True # False
device = hapi.set_device('gpu')
# if use static graph, do not set
paddle.disable_static(device) if dynamic else None
# inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label')
model = hapi.Model(Mnist(), input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameter_list=model.parameters())
model.prepare(optim,
paddle.nn.CrossEntropyLoss(),
hapi.metrics.Accuracy())
mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False)
model.fit(mnist_data, epochs=1, batch_size=32, verbose=0)
model.save_inference_model('inference_model')
"""
        def get_inout_spec(all_vars, return_name=False):
            valid_vars = [var for var in all_vars if isinstance(var, Variable)]
            result_list = valid_vars
            if return_name:
                result_list = [var.name for var in result_list]
            return result_list
        # TODO:
        # 1. Make it unnecessary for users to run the model before calling
        #    `save_inference_model` in dygraph mode.
        # 2. Save the correct input shape; currently the interface stores the
        #    shape of whatever the user fed to the model's inputs when running it.
        # 3. Make it unnecessary for users to add `@paddle.jit.to_static` in dynamic mode.
if fluid.in_dygraph_mode():
layer = self.network
fluid.disable_dygraph()
# 1. input check
prog_translator = ProgramTranslator()
if not prog_translator.enable_declarative:
raise RuntimeError(
"save_inference_model doesn't work when setting ProgramTranslator.enable=False."
)
if not isinstance(layer, Layer):
raise TypeError(
"The input layer should be 'Layer', but received layer type is %s."
% type(layer))
# 2. get program of declarative Layer.forward
prog_cache = prog_translator.get_program_cache()
# make dummy args & kwargs, to get excepted FunctionSpec
layer_func = FunctionSpec(type(layer).forward, [layer], {})
concrete_program, _ = prog_cache.get_program(layer_func)
# NOTE: we maintain the mapping of variable name to
# structured name, the buffer variable (non-persistable)
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
for structured_name, var in layer.state_dict().items():
state_names_dict[var.name] = structured_name
# 3. share parameters from Layer to scope & record var info
scope = core.Scope()
extra_var_info = dict()
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
src_tensor = param_or_buffer.value().get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
extra_info_dict = dict()
if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[
param_or_buffer.name]
extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output spec
input_var_names = get_inout_spec(concrete_program.inputs, True)
output_vars = get_inout_spec(concrete_program.outputs)
# 5. save inference model
with scope_guard(scope):
return fluid.io.save_inference_model(
dirname=save_dir,
feeded_var_names=input_var_names,
target_vars=output_vars,
executor=Executor(_current_expected_place()),
main_program=concrete_program.main_program.clone(),
model_filename=model_filename,
params_filename=params_filename,
program_only=model_only)
else:
prog = self._adapter._progs.get('test', None)
assert prog, \
"Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output']
return fluid.io.save_inference_model(
save_dir,
input_names,
endpoints,
self._adapter._executor,
main_program=infer_prog,
model_filename=model_filename,
params_filename=params_filename,
program_only=model_only)
def _run_one_epoch(self, data_loader, callbacks, mode, logs={}):
outputs = []
......
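# A minimal sketch (not part of this commit) of consuming the artifacts that
# `_save_inference_model` / `save(path, training=False)` write: load them back
# with fluid.io.load_inference_model and run one batch. The directory name and
# input shape below are illustrative assumptions.
import numpy as np
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    dirname='inference_model', executor=exe)
img = np.random.random((1, 1, 28, 28)).astype('float32')
results = exe.run(program,
                  feed={feed_names[0]: img},
                  fetch_list=fetch_targets)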
......@@ -12,6 +12,7 @@ endforeach()
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150)
function(py_dist_test TARGET_NAME)
......
......@@ -33,6 +33,8 @@ from paddle.metric import Accuracy
from paddle.incubate.hapi.datasets import MNIST
from paddle.incubate.hapi.vision.models import LeNet
from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
class LeNetDygraph(fluid.dygraph.Layer):
......@@ -65,6 +67,37 @@ class LeNetDygraph(fluid.dygraph.Layer):
return x
class LeNetDeclarative(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation=None):
super(LeNetDeclarative, self).__init__()
self.num_classes = num_classes
self.features = Sequential(
Conv2d(
1, 6, 3, stride=1, padding=1),
ReLU(),
Pool2D(2, 'max', 2),
Conv2d(
6, 16, 5, stride=1, padding=0),
ReLU(),
Pool2D(2, 'max', 2))
if num_classes > 0:
self.fc = Sequential(
Linear(400, 120),
Linear(120, 84),
Linear(
84, 10, act=classifier_activation))
@declarative
def forward(self, inputs):
x = self.features(inputs)
if self.num_classes > 0:
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
class MnistDataset(MNIST):
def __init__(self, mode, return_label=True, sample_num=None):
super(MnistDataset, self).__init__(mode=mode)
......@@ -335,7 +368,6 @@ class TestModelFunction(unittest.TestCase):
model = Model(net, inputs, labels)
model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum"))
loss, = model.train_batch([data], [label])
np.testing.assert_allclose(loss.flatten(), ref.flatten())
fluid.disable_dygraph() if dynamic else None
......@@ -445,33 +477,38 @@ class TestModelFunction(unittest.TestCase):
fluid.disable_dygraph() if dynamic else None
def test_export_deploy_model(self):
for dynamic in [True, False]:
fluid.enable_dygraph() if dynamic else None
# paddle.disable_static() if dynamic else None
prog_translator = ProgramTranslator()
prog_translator.enable(False) if not dynamic else None
net = LeNetDeclarative()
inputs = [Input([None, 1, 28, 28], 'float32', 'x')]
model = Model(net, inputs)
model.prepare()
save_dir = tempfile.mkdtemp()
if not os.path.exists(save_dir):
os.makedirs(save_dir)
tensor_img = np.array(
np.random.random((1, 1, 28, 28)), dtype=np.float32)
ori_results = model.test_batch(tensor_img)
model.save(save_dir, training=False)
fluid.disable_dygraph() if dynamic else None
place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
new_scope = fluid.Scope()
with fluid.scope_guard(new_scope):
exe = fluid.Executor(place)
[inference_program, feed_target_names, fetch_targets] = (
fluid.io.load_inference_model(
dirname=save_dir, executor=exe))
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
np.testing.assert_allclose(
results, ori_results, rtol=1e-5, atol=1e-7)
shutil.rmtree(save_dir)
class TestRaiseError(unittest.TestCase):
......
......@@ -128,6 +128,12 @@ from .layer.norm import GroupNorm #DEFINE_ALIAS
from .layer.norm import LayerNorm #DEFINE_ALIAS
from .layer.norm import SpectralNorm #DEFINE_ALIAS
from .layer.norm import InstanceNorm #DEFINE_ALIAS
from .layer.norm import InstanceNorm1d #DEFINE_ALIAS
from .layer.norm import InstanceNorm2d #DEFINE_ALIAS
from .layer.norm import InstanceNorm3d #DEFINE_ALIAS
from .layer.norm import BatchNorm1d #DEFINE_ALIAS
from .layer.norm import BatchNorm2d #DEFINE_ALIAS
from .layer.norm import BatchNorm3d #DEFINE_ALIAS
# from .layer.rnn import RNNCell #DEFINE_ALIAS
# from .layer.rnn import GRUCell #DEFINE_ALIAS
# from .layer.rnn import LSTMCell #DEFINE_ALIAS
......
......@@ -160,12 +160,12 @@ from .loss import square_error_cost #DEFINE_ALIAS
from .loss import ssd_loss #DEFINE_ALIAS
from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS
from .loss import ctc_loss #DEFINE_ALIAS
# from .norm import batch_norm #DEFINE_ALIAS
# from .norm import data_norm #DEFINE_ALIAS
# from .norm import group_norm #DEFINE_ALIAS
# from .norm import instance_norm #DEFINE_ALIAS
from .norm import l2_normalize #DEFINE_ALIAS
# from .norm import layer_norm #DEFINE_ALIAS
from .norm import batch_norm #DEFINE_ALIAS
from .norm import instance_norm #DEFINE_ALIAS
from .norm import layer_norm #DEFINE_ALIAS
from .norm import lrn #DEFINE_ALIAS
from .norm import normalize #DEFINE_ALIAS
# from .norm import spectral_norm #DEFINE_ALIAS
......
......@@ -18,16 +18,19 @@ import paddle.fluid as fluid
from ...fluid.data_feeder import check_variable_and_dtype, check_type
from ...fluid.layer_helper import LayerHelper
from ...fluid.framework import in_dygraph_mode, core
from ...framework import create_parameter
from ...fluid.layers import l2_normalize #DEFINE_ALIAS
from ...fluid.layers import lrn #DEFINE_ALIAS
from ...fluid.initializer import Constant
from ...fluid.param_attr import ParamAttr
from ...fluid import core, dygraph_utils
__all__ = [
# 'batch_norm',
'batch_norm',
# 'data_norm',
# 'group_norm',
# 'instance_norm',
'instance_norm',
'l2_normalize',
# 'layer_norm',
'layer_norm',
'lrn',
'normalize',
# 'spectral_norm'
......@@ -110,3 +113,286 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
eps = out.block.create_var(dtype=out.dtype)
paddle.fill_constant([1], out.dtype, epsilon, out=eps)
return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name)
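# A minimal sketch (not part of this commit) cross-checking the semantics of
# `normalize` above against NumPy: out = x / max(||x||_p, epsilon) along `axis`.
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((3, 4)).astype('float32')
norm = np.maximum(np.linalg.norm(x_np, ord=2, axis=1, keepdims=True), 1e-12)
out = paddle.nn.functional.normalize(paddle.to_tensor(x_np), p=2, axis=1)
assert np.allclose(out.numpy(), x_np / norm, atol=1e-6)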
def batch_norm(x,
running_mean,
running_var,
weight,
bias,
training=False,
momentum=0.9,
epsilon=1e-05,
data_format="NCHW",
name=None):
"""
    Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
    nn.functional.batch_norm is used by nn.BatchNorm1d, nn.BatchNorm2d and nn.BatchNorm3d. Please use those APIs for BatchNorm.
    Parameters:
        x(Tensor): input value. Its data type should be float32 or float64.
        running_mean(Tensor): running mean.
        running_var(Tensor): running variance.
        weight(Tensor): The weight tensor of batch_norm; it can not be None.
        bias(Tensor): The bias tensor of batch_norm; it can not be None.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        training(bool, optional): True means train mode, which computes statistics from the batch data and tracks the global mean and variance during training. False means inference mode, which uses the global mean and variance computed during training. Default: False.
        data_format(str, optional): Specify the input data format; may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW".
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
          np.random.seed(123)
x = np.random.random(size=(2, 1, 2, 3)).astype('float32')
running_mean = np.random.random(size=1).astype('float32')
running_variance = np.random.random(size=1).astype('float32')
weight_data = np.random.random(size=1).astype('float32')
bias_data = np.random.random(size=1).astype('float32')
x = paddle.to_tensor(x)
rm = paddle.to_tensor(running_mean)
rv = paddle.to_tensor(running_variance)
w = paddle.to_tensor(weight_data)
b = paddle.to_tensor(bias_data)
batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
          print(batch_norm_out.numpy())
"""
assert len(x.shape) >= 2, "input dim must be larger than 1"
    # `not training` means use_global_stats; more details in nn._BatchNormBase
    use_global_stats = not training
    # input and output must share the memory
mean_out = running_mean
variance_out = running_var
if in_dygraph_mode():
# for dygraph need tuple
attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
data_format, "use_mkldnn", False, "fuse_with_relu", False,
"use_global_stats", use_global_stats)
batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
x, weight, bias, running_mean, running_var, mean_out, variance_out,
*attrs)
return dygraph_utils._append_activation_in_dygraph(
batch_norm_out, act=None)
check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
'BatchNorm')
# for static need dict
attrs = {
"momentum": momentum,
"epsilon": epsilon,
"data_layout": data_format,
"use_mkldnn": False,
"fuse_with_relu": False,
"use_global_stats": use_global_stats,
}
inputs = {
"X": [x],
"Scale": [weight],
"Bias": [bias],
"Mean": [running_mean],
"Variance": [running_var]
}
helper = LayerHelper('batch_norm', **locals())
    dtype = x.dtype if x.dtype != 'float16' else 'float32'
saved_mean = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
batch_norm_out = helper.create_variable_for_type_inference(dtype)
outputs = {
"Y": [batch_norm_out],
"MeanOut": [running_mean],
"VarianceOut": [running_var],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance]
}
helper.append_op(
type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return helper.append_activation(batch_norm_out)
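# A minimal sketch (not part of this commit) verifying the inference-mode math
# of the functional batch_norm above against NumPy:
#   y = (x - running_mean) / sqrt(running_var + epsilon) * weight + bias,
# broadcast over the channel axis of an NCHW input. All names are illustrative.
import numpy as np
import paddle

paddle.disable_static()
np.random.seed(0)
x_np = np.random.random((2, 3, 4, 4)).astype('float32')
rm, rv, w, b = (np.random.random(3).astype('float32') for _ in range(4))
bc = lambda a: a.reshape(1, 3, 1, 1)  # broadcast a per-channel vector
expected = (x_np - bc(rm)) / np.sqrt(bc(rv) + 1e-5) * bc(w) + bc(b)
out = paddle.nn.functional.batch_norm(
    paddle.to_tensor(x_np), paddle.to_tensor(rm), paddle.to_tensor(rv),
    paddle.to_tensor(w), paddle.to_tensor(b), training=False)
assert np.allclose(out.numpy(), expected, atol=1e-4)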
def layer_norm(x,
normalized_shape,
weight=None,
bias=None,
epsilon=1e-05,
name=None):
"""
    See more details in paddle.nn.LayerNorm.
Parameters:
x(Tensor): Input Tensor. It's data type should be float32, float64.
normalized_shape(int|list|tuple): Input shape from an expected input of
size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
If it is a single integer, this module will normalize over the last dimension
which is expected to be of that specific size.
epsilon(float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
        weight(Tensor, optional): The weight tensor of layer_norm. Default: None.
        bias(Tensor, optional): The bias tensor of layer_norm. Default: None.
        name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
          layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:])
          print(layer_norm_out.numpy())
"""
input_shape = list(x.shape)
input_ndim = len(input_shape)
normalized_ndim = len(normalized_shape)
begin_norm_axis = input_ndim - normalized_ndim
if input_ndim < normalized_ndim or input_shape[
begin_norm_axis:] != normalized_shape:
str_normalized_shape = str(normalized_shape)
raise ValueError('Given normalized_shape is ' + str_normalized_shape +
', expected input with shape [*, ' +
str_normalized_shape[
1:] + ', but got input shape ' + str(input_shape))
if in_dygraph_mode():
pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon,
'begin_norm_axis', begin_norm_axis)
return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm')
inputs = dict()
inputs['X'] = [x]
if weight:
inputs['Scale'] = [weight]
if bias:
inputs['Bias'] = [bias]
attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}
# create output
helper = LayerHelper('layer_norm', **locals())
mean_out = helper.create_variable_for_type_inference(
dtype=x.type, stop_gradient=True)
variance_out = helper.create_variable_for_type_inference(
dtype=x.type, stop_gradient=True)
layer_norm_out = helper.create_variable_for_type_inference(x.type)
helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis})
return helper.append_activation(layer_norm_out)
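# A minimal sketch (not part of this commit) checking the layer_norm above
# against a NumPy reference: normalize over the trailing `normalized_shape`
# dims per sample (no affine transform here since weight/bias stay None).
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((2, 3, 4)).astype('float32')
mean = x_np.reshape(2, -1).mean(axis=1).reshape(2, 1, 1)
var = x_np.reshape(2, -1).var(axis=1).reshape(2, 1, 1)
expected = (x_np - mean) / np.sqrt(var + 1e-5)
out = paddle.nn.functional.layer_norm(
    paddle.to_tensor(x_np), list(x_np.shape[1:]))
assert np.allclose(out.numpy(), expected, atol=1e-4)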
def instance_norm(x,
running_mean=None,
running_var=None,
weight=None,
bias=None,
use_input_stats=True,
momentum=0.9,
eps=1e-05,
data_format="NCHW",
name=None):
"""
    See more details in nn.layer.InstanceNorm2d.
Parameters:
x(Tensor): Input Tensor. It's data type should be float32, float64.
running_mean(Tensor): running mean. Default None.
running_var(Tensor): running variance. Default None.
weight(Tensor, optional): The weight tensor of instance_norm. Default: None.
bias(Tensor, optional): The bias tensor of instance_norm. Default: None.
eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5.
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        use_input_stats(bool, optional): Default True.
        data_format(str, optional): Specify the input data format; may be "NC", "NCL", "NCHW" or "NCDHW". Default: "NCHW".
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
Returns:
        A Tensor holding the normalized result.
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
np.random.seed(123)
x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
x = paddle.to_tensor(x_data)
          instance_norm_out = paddle.nn.functional.instance_norm(x)
          print(instance_norm_out.numpy())
"""
if in_dygraph_mode():
out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps,
"momentum", momentum, "data_format",
data_format)
return out
check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm")
attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format}
if weight and bias:
inputs = {"X": [x], "Scale": [weight], "Bias": [bias]}
else:
inputs = {"X": [x]}
helper = LayerHelper('instance_norm', **locals())
saved_mean = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
outputs = {
"Y": [instance_norm_out],
"SavedMean": [saved_mean],
"SavedVariance": [saved_variance]
}
helper.append_op(
type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return instance_norm_out
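# A minimal sketch (not part of this commit) checking instance_norm above
# against NumPy: every (sample, channel) slice is normalized with its own
# mean and variance over the spatial dims.
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((2, 3, 4, 5)).astype('float32')
mean = x_np.mean(axis=(2, 3), keepdims=True)
var = x_np.var(axis=(2, 3), keepdims=True)
expected = (x_np - mean) / np.sqrt(var + 1e-5)
out = paddle.nn.functional.instance_norm(paddle.to_tensor(x_np))
assert np.allclose(out.numpy(), expected, atol=1e-4)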
......@@ -41,6 +41,7 @@ from ...fluid import core
from ...fluid.framework import in_dygraph_mode
from ...fluid.param_attr import ParamAttr
from ...fluid.initializer import Constant
from paddle.framework import get_default_dtype
from .. import functional as F
......@@ -423,7 +424,7 @@ class PReLU(layers.Layer):
For more information, please refer to :ref:`api_guide_Name`.
Shape:
- input: Tensor with any shape.
- input: Tensor with any shape. Default dtype is float32.
- output: Tensor with the same shape as input.
Examples:
......@@ -433,13 +434,14 @@ class PReLU(layers.Layer):
import numpy as np
paddle.disable_static()
paddle.set_default_dtype("float64")
data = np.array([[[[-2.0, 3.0, -4.0, 5.0],
[ 3.0, -4.0, 5.0, -6.0],
[-7.0, -8.0, 8.0, 9.0]],
[[ 1.0, -2.0, -3.0, 4.0],
[-5.0, 6.0, 7.0, -8.0],
[ 6.0, 7.0, 8.0, 9.0]]]], 'float32')
[ 6.0, 7.0, 8.0, 9.0]]]], 'float64')
x = paddle.to_tensor(data)
m = paddle.nn.PReLU(1, 0.25)
out = m(x)
......@@ -461,10 +463,10 @@ class PReLU(layers.Layer):
self._weight = self.create_parameter(
attr=self._weight_attr,
shape=[num_parameters],
dtype='float32',
shape=[self._num_parameters],
dtype=get_default_dtype(),
is_bias=False,
default_initializer=Constant(init))
default_initializer=Constant(self._init))
def forward(self, x):
return F.prelu(x, self._weight)
......
(This diff is collapsed.)
......@@ -27,7 +27,6 @@ from ..fluid.layers import expand_as #DEFINE_ALIAS
from ..fluid.layers import slice #DEFINE_ALIAS
from ..fluid.layers import strided_slice #DEFINE_ALIAS
from ..fluid.layers import transpose #DEFINE_ALIAS
from ..fluid.layers import unique #DEFINE_ALIAS
from ..fluid.layers import unstack #DEFINE_ALIAS
from ..fluid.layers import scatter_nd_add #DEFINE_ALIAS
......@@ -608,6 +607,126 @@ def squeeze(x, axis=None, name=None):
return layers.squeeze(x, axis, name)
def unique(x,
return_index=False,
return_inverse=False,
return_counts=False,
axis=None,
name=None):
"""
Returns the unique elements of `x` in ascending order.
Args:
x(Tensor): The input tensor, it's data type should be float32, float64, int32, int64.
return_index(bool, optional): If True, also return the indices of the input tensor that
result in the unique Tensor.
return_inverse(bool, optional): If True, also return the indices for where elements in
the original input ended up in the returned unique tensor.
return_counts(bool, optional): If True, also return the counts for each unique element.
axis(int, optional): The axis to apply unique. If None, the input will be flattened.
Default: None.
name(str, optional): Name for the operation. For more information, please refer to
:ref:`api_guide_Name`. Default: None.
Returns:
tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \
provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \
is True. `counts` is provided only if `return_counts` is True.
Examples:
.. code-block:: python
import numpy as np
import paddle
paddle.disable_static()
x_data = np.array([2, 3, 3, 1, 5, 3])
x = paddle.to_tensor(x_data)
unique = paddle.unique(x)
np_unique = unique.numpy() # [1 2 3 5]
_, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True)
np_indices = indices.numpy() # [3 0 1 4]
np_inverse = inverse.numpy() # [1 2 2 0 3 2]
np_counts = counts.numpy() # [1 1 3 1]
            x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]])
            x = paddle.to_tensor(x_data)
            unique = paddle.unique(x)
np_unique = unique.numpy() # [0 1 2 3]
unique = paddle.unique(x, axis=0)
np_unique = unique.numpy()
# [[2 1 3]
# [3 0 1]]
"""
if axis is None:
axis = []
else:
axis = [axis]
if in_dygraph_mode():
out, inverse, indices, counts = core.ops.unique(
x, 'dtype',
convert_np_dtype_to_dtype_('int32'), 'return_index', return_index,
'return_inverse', return_inverse, 'return_counts', return_counts,
'axis', axis, "is_sorted", True)
outs = [out]
if return_index:
outs.append(indices)
if return_inverse:
outs.append(inverse)
if return_counts:
outs.append(counts)
if len(outs) == 1:
return outs[0]
return tuple(outs)
check_variable_and_dtype(x, "input",
['float32', 'float64', 'int32', 'int64'], 'unique')
check_type(return_index, 'return_index', bool, 'unique')
check_type(return_inverse, 'return_inverse', bool, 'unique')
check_type(return_counts, 'return_counts', bool, 'unique')
if len(axis) != 0:
check_type(axis[0], 'axis', int, 'unique')
helper = LayerHelper('unique', **locals())
attrs = {
'dtype': int(core.VarDesc.VarType.INT32),
"return_index": return_index,
"return_inverse": return_inverse,
"return_counts": return_counts,
"axis": axis,
"is_sorted": True
}
out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
inverse = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs = {"Out": out, "Index": inverse}
outs = [out]
if return_index:
indices = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs["Indices"] = indices
outs.append(indices)
if return_inverse:
outs.append(inverse)
if return_counts:
counts = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.INT64, stop_gradient=True)
outputs["Counts"] = counts
outs.append(counts)
helper.append_op(
type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs)
if len(outs) == 1:
return outs[0]
return tuple(outs)
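# A minimal sketch (not part of this commit) of the invariants relating the
# outputs of `unique` above: `out[inverse]` reconstructs `x`, and `counts`
# sums to the number of elements.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3]))
out, index, inverse, counts = paddle.unique(
    x, return_index=True, return_inverse=True, return_counts=True)
assert (out.numpy()[inverse.numpy()] == x.numpy()).all()
assert counts.numpy().sum() == x.numpy().size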
def unsqueeze(x, axis, name=None):
"""
:alias_main: paddle.unsqueeze
......
......@@ -562,34 +562,52 @@ floor_mod = remainder #DEFINE_ALIAS
def multiply(x, y, axis=-1, name=None):
"""
:alias_main: paddle.multiply
:alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply
multiply two tensors element-wise. The equation is:
Examples:
.. math::
out = x * y
.. code-block:: python
**Note**:
``paddle.multiply`` supports broadcasting. If you would like to know more about broadcasting, please refer to :ref:`user_guide_broadcasting` .
import paddle
import numpy as np
Args:
x (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
y (Tensor): the input tensor, its data type should be float32, float64, int32, int64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
paddle.disable_static()
x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
res = paddle.multiply(x, y)
print(res.numpy()) # [[5, 12], [21, 32]]
Returns:
N-D Tensor. A location into which the result is stored. Its dimension equals with $x$.
x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
y_data = np.array([1, 2], dtype=np.float32)
x = paddle.to_variable(x_data)
y = paddle.to_variable(y_data)
res = paddle.multiply(x, y, axis=1)
print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
Examples:
.. code-block:: python
import paddle
import numpy as np
paddle.disable_static()
x_data = np.array([[1, 2], [3, 4]], dtype=np.float32)
y_data = np.array([[5, 6], [7, 8]], dtype=np.float32)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y)
print(res.numpy()) # [[5, 12], [21, 32]]
x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32)
y_data = np.array([1, 2], dtype=np.float32)
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
res = paddle.multiply(x, y, axis=1)
print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
"""
op_type = 'elementwise_mul'
act = None
if x.dtype != y.dtype:
raise TypeError(
'Input tensors must be same type, but received type of x: %s, type of y: %s '
% (x.dtype, y.dtype))
if in_dygraph_mode():
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
......
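# A minimal sketch (not part of this commit) of the dtype check added to
# `multiply` above: mixing float32 and float64 operands now raises TypeError.
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.ones((2, 2), dtype=np.float32))
y = paddle.to_tensor(np.ones((2, 2), dtype=np.float64))
try:
    paddle.multiply(x, y)
except TypeError as e:
    print('dtype mismatch rejected:', e)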
......@@ -21,7 +21,6 @@ from ..fluid import core, layers
from ..fluid.layers import argmin #DEFINE_ALIAS
from ..fluid.layers import has_inf #DEFINE_ALIAS
from ..fluid.layers import has_nan #DEFINE_ALIAS
from ..fluid.layers import topk #DEFINE_ALIAS
__all__ = [
'argmax',
......@@ -756,3 +755,100 @@ def masked_select(x, mask, name=None):
type='masked_select', inputs={'X': x,
'Mask': mask}, outputs={'Y': out})
return out
def topk(x, k, axis=None, largest=True, sorted=True, name=None):
"""
    This OP is used to find the values and indices of the k largest or smallest elements along the given axis.
If the input is a 1-D Tensor, finds the k largest or smallest values and indices.
If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`.
Args:
x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64.
k(int, Tensor): The number of top elements to look for along the axis.
axis(int, optional): Axis to compute indices along. The effective range
is [-R, R), where R is x.ndim. when axis < 0, it works the same way
as axis + R. Default is -1.
        largest(bool, optional): a flag; if set to True, the algorithm will sort
            in descending order, otherwise in ascending order. Default is True.
        sorted(bool, optional): controls whether to return the elements in sorted order; default value is True. On GPU, it always returns sorted values.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64.
Examples:
.. code-block:: python
import numpy as np
import paddle
paddle.disable_static()
data_1 = np.array([1, 4, 5, 7])
tensor_1 = paddle.to_tensor(data_1)
value_1, indices_1 = paddle.topk(tensor_1, k=1)
print(value_1.numpy())
# [7]
print(indices_1.numpy())
# [3]
data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]])
tensor_2 = paddle.to_tensor(data_2)
value_2, indices_2 = paddle.topk(tensor_2, k=1)
print(value_2.numpy())
# [[7]
# [6]]
print(indices_2.numpy())
# [[3]
# [1]]
value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1)
print(value_3.numpy())
# [[7]
# [6]]
print(indices_3.numpy())
# [[3]
# [1]]
value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0)
print(value_4.numpy())
# [[2 6 5 7]]
print(indices_4.numpy())
# [[1 1 0 0]]
"""
if in_dygraph_mode():
k = k.numpy().item(0) if isinstance(k, Variable) else k
if axis is None:
out, indices = core.ops.top_k_v2(x, 'k',
int(k), 'largest', largest,
'sorted', sorted)
else:
out, indices = core.ops.top_k_v2(x, 'k',
int(k), 'axis', axis, 'largest',
largest, 'sorted', sorted)
return out, indices
helper = LayerHelper("top_k_v2", **locals())
inputs = {"X": [x]}
attrs = {}
if isinstance(k, Variable):
inputs['K'] = [k]
else:
attrs = {'k': k}
attrs['largest'] = largest
attrs['sorted'] = sorted
if axis is not None:
attrs['axis'] = axis
values = helper.create_variable_for_type_inference(dtype=x.dtype)
indices = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op(
type="top_k_v2",
inputs=inputs,
outputs={"Out": [values],
"Indices": [indices]},
attrs=attrs)
indices.stop_gradient = True
return values, indices
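# A minimal sketch (not part of this commit) cross-checking `topk` above with
# a NumPy reference built from argsort. With distinct random values the
# returned values must match (indices may differ only on ties).
import numpy as np
import paddle

paddle.disable_static()
x_np = np.random.random((4, 6)).astype('float32')
order = np.argsort(-x_np, axis=1)[:, :2]             # top-2 indices per row
ref_vals = np.take_along_axis(x_np, order, axis=1)
vals, idx = paddle.topk(paddle.to_tensor(x_np), k=2, axis=1)
assert np.allclose(vals.numpy(), ref_vals)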
......@@ -63,12 +63,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o
go get github.com/Masterminds/glide && \
rm -rf /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \
LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
......
......@@ -156,19 +156,19 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 --no-cache-dir install opencv-python && \
pip3 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 --no-cache-dir install opencv-python && \
pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 --no-cache-dir install opencv-python && \
pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \
pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.8 --no-cache-dir install opencv-python && \
pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \
pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip --no-cache-dir install opencv-python
pip --no-cache-dir install opencv-python==4.2.0.32
#For docstring checker
RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
......
......@@ -89,7 +89,7 @@ function do_cpython_build {
fi
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2
cd /
ls ${MY_DIR}
local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
......
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -199,12 +198,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -11,7 +11,6 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
......@@ -212,12 +211,6 @@ RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
RUN pip --no-cache-dir install certifi urllib3[secure]
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
(cd /woboq \
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release . \
make)
# ar mishandles 4GB files
# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
......
......@@ -480,14 +480,8 @@ def get_filenames():
filename = ''
print("\nWARNING:----Exception in get api filename----\n")
print("\n" + api + ' module is ' + module + "\n")
if filename != '':
# rm contrib file
if filename.startswith(
'../python/paddle/fluid/contrib'
) or filename == '../python/paddle/verison.py':
pass
elif filename not in filenames:
filenames.append(filename)
if filename != '' and filename not in filenames:
filenames.append(filename)
# get all methods
method = ''
if inspect.isclass(eval(api)):
......@@ -557,14 +551,18 @@ def get_wlist():
'''
wlist = []
wlist_file = []
with open("wlist.json", 'r') as load_f:
load_dict = json.load(load_f)
for key in load_dict:
            if key == 'wlist_file':
                wlist_file = wlist_file + load_dict[key]
            else:
                wlist = wlist + load_dict[key]
    return wlist, wlist_file
wlist, wlist_file = get_wlist()
if len(sys.argv) < 2:
print("Error: inadequate number of arguments")
......@@ -590,8 +588,14 @@ else:
if len(filenames) == 0 and len(whl_error) == 0:
print("-----API_PR.spec is the same as API_DEV.spec-----")
exit(0)
elif '../python/paddle/fluid/core_avx.py' in filenames:
filenames.remove('../python/paddle/fluid/core_avx.py')
rm_file = []
    for f in list(filenames):  # iterate over a copy, since filenames is mutated below
for w_file in wlist_file:
if f.startswith(w_file):
rm_file.append(f)
filenames.remove(f)
if len(rm_file) != 0:
print("REMOVE white files: %s" % rm_file)
print("API_PR is diff from API_DEV: %s" % filenames)
one_part_filenum = int(math.ceil(len(filenames) / cpus))
if one_part_filenum == 0:
......
{
"wlist_file" : [
"../python/paddle/fluid/contrib",
"../python/paddle/verison.py",
"../python/paddle/fluid/core_avx.py",
"../python/paddle/distributed"
],
"wlist_inneed":[
"append_LARS",
"BuildStrategy.debug_graphviz_path",
......