提交 557229bd 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fea/jit/rnn

......@@ -204,6 +204,7 @@ include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
include(external/rocprim)
include(external/xxhash) # download xxhash
include(external/dlpack)
include(external/snappy) # download snappy
......
......@@ -22,6 +22,27 @@ ENV HOME /root
# Add bash enhancements
COPY ./paddle/scripts/docker/root/ /root/
# Prepare packages for Python
RUN apt-get update && \
apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
xz-utils tk-dev libffi-dev liblzma-dev
# Install Python3.6
RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \
tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \
./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \
wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \
tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null
# Install Python3.7
RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \
CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
make -j8 > /dev/null && make altinstall > /dev/null
RUN apt-get update && \
apt-get install -y --allow-downgrades patchelf \
python3 python3-dev python3-pip \
......@@ -74,6 +95,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.6 install -U wheel && \
pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \
pip3.7 install -U wheel && \
pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \
pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
pip install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
......@@ -82,22 +109,34 @@ RUN pip3 install -U wheel && \
RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.6 install opencv-python && \
pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3.7 install opencv-python && \
pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
#For docstring checker
RUN pip3 install pylint pytest astroid isort
RUN pip3.6 install pylint pytest astroid isort
RUN pip3.7 install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/
RUN pip3 install -r /root/requirements.txt
RUN pip3.6 install -r /root/requirements.txt
RUN pip3.7 install -r /root/requirements.txt
RUN pip install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip3 install certifi urllib3[secure]
RUN pip3.6 install certifi urllib3[secure]
RUN pip3.7 install certifi urllib3[secure]
RUN pip install certifi urllib3[secure]
......
......@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -53,7 +53,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "21fb5f2af1dd14e132af4f1b79160977ee487818"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
if (NOT WITH_AMD_GPU)
return()
endif()
# rocprim is "ROCm Parallel Primitives" for short.
# It is a header-only library providing HIP and HC parallel primitives
# for developing performant GPU-accelerated code on AMD ROCm platform.
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
INCLUDE(ExternalProject)
SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim)
SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)
ExternalProject_Add(
extern_rocprim
GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
PREFIX ${ROCPRIM_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
CMAKE_ARGS -DONLY_INSTALL=ON
CMAKE_ARGS -DBUILD_TEST=OFF
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
INSTALL_DIR ${ROCPRIM_INSTALL_DIR}
${EXTERNAL_PROJECT_LOG_ARGS}
)
INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR})
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
add_library(rocprim STATIC ${dummyfile})
else()
add_library(rocprim INTERFACE)
endif()
add_dependencies(rocprim extern_rocprim)
......@@ -129,6 +129,9 @@ set(COMMON_FLAGS
-Wno-error=parentheses-equality # Warnings in pybind11
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-Wno-error=terminate # Warning in PADDLE_ENFORCE
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
)
set(GPU_COMMON_FLAGS
......
......@@ -454,25 +454,29 @@ function(hip_library TARGET_NAME)
else()
add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
find_fluid_modules(${TARGET_NAME})
target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so)
find_fluid_modules(${TARGET_NAME})
endif()
if (hip_library_DEPS)
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
if("${hip_library_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END)
else()
target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
else()
message(FATAL "Please specify source file or library in nv_library.")
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(hip_library_SRCS)
endif()
......
......@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hip/include")
include_directories("/opt/rocm/miopen/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
......@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_DOUBLE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(WITH_DISTRIBUTE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE")
endif(WITH_DISTRIBUTE)
if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
if(NOT WITH_GOLANG)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
endif(NOT WITH_GOLANG)
if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(NOT WITH_RDMA)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
endif(NOT WITH_RDMA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
......
......@@ -109,7 +109,8 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
......@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
......@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
......
......@@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
// WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// WhileOp would set keep_kids to true,
// because WhileGradOp needs the scopes created in WhileOp.
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
......
......@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
op->SetIsCalledByExecutor(false);
op->Run(*scope_, place_);
}
}
......
......@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::LONGS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from LONGS to LONGS";
this->attrs_[name] = std::vector<int64_t>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(110) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
......
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
......@@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
......@@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
// In the inference scenerio, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memroy explosion
// over the running of operators.
// We use a thread_local cache to fix that issue, the key in the cache is
// the combination of the `scope` argument, from_kernel_type,
// target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some
// changes on this logic for this macro might not tested on the other
// scenerios.
// If this op is not called by an Executor or ParallelExecutor, it should
// called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
// variables, that behavior a lot different.
if (!run_by_executor_) {
new_scope = TryCreateTransferScope(kernel_type_for_var,
expected_kernel_key, &scope);
}
#endif
if (new_scope == nullptr) {
if (!new_scope) {
new_scope = &scope.NewScope();
}
......
......@@ -127,6 +127,8 @@ class OperatorBase {
//! Get all outputs variable names
virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
protected:
std::string type_;
// NOTE: in case of OpGrad, inputs_ contains:
......@@ -139,6 +141,8 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap outputs_;
AttributeMap attrs_;
// Whether this operator executes in an Executor.
bool run_by_executor_{true};
private:
void GenerateTemporaryNames();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/transfer_scope_cache.h"
namespace paddle {
namespace framework {
// Holds all the transfer scope across the process.
std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
typedef std::unordered_map<size_t, Scope*> map_t;
thread_local std::unique_ptr<map_t> x(new map_t);
return *x;
}
// Holds all the transfer scope for this thread.
std::unordered_set<Scope*>& global_transfer_scope_cache() {
typedef std::unordered_set<Scope*> set_t;
thread_local std::unique_ptr<set_t> x(new set_t);
return *x;
}
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope) {
Scope* new_scope{nullptr};
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));
auto it = global_transfer_data_cache().find(infer_cache_key);
if (it != global_transfer_data_cache().end()) {
new_scope = global_transfer_data_cache()[infer_cache_key];
} else {
new_scope = &scope->NewScope();
global_transfer_data_cache()[infer_cache_key] = new_scope;
}
global_transfer_scope_cache().insert(new_scope);
return new_scope;
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread> // NOLINT
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
std::unordered_set<Scope*>& global_transfer_scope_cache();
// Combine two hash values to a single hash.
static size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
const Scope* scope);
void RemoveKidsFromTransferScopeCache(Scope* scope);
} // namespace framework
} // namespace paddle
......@@ -4,6 +4,7 @@ endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis)
add_subdirectory(utils)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
......
......@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
......
......@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
use_feed_fetch_ops = other.use_feed_fetch_ops;
......@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
use_feed_fetch_ops = other.use_feed_fetch_ops;
......
......@@ -31,11 +31,11 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle {
......@@ -67,7 +67,7 @@ bool AnalysisPredictor::Init(
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
if (!PrepareScope(parent_scope)) {
return false;
......@@ -160,6 +160,14 @@ bool AnalysisPredictor::PrepareExecutor() {
return true;
}
void AnalysisPredictor::SetMkldnnThreadID(int tid) {
#ifdef PADDLE_WITH_MKLDNN
platform::set_cur_thread_id(tid);
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
#endif
}
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
......@@ -167,7 +175,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
inference::Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
......@@ -208,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<float>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
......
......@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
framework::Scope *scope() { return scope_.get(); }
framework::ProgramDesc &program() { return *inference_program_; }
void SetMkldnnThreadID(int tid);
protected:
bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
......
......@@ -24,11 +24,11 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace {
......@@ -76,7 +76,7 @@ bool NativePaddlePredictor::Init(
#endif
// no matter with or without MKLDNN
paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
......@@ -139,7 +139,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
Timer timer;
timer.tic();
// set feed variable
std::vector<framework::LoDTensor> feeds;
framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
if (!SetFeed(inputs, scope)) {
LOG(ERROR) << "fail to set feed";
......@@ -195,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::DDim ddim = framework::make_ddim(inputs[i].shape);
void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
input_ptr = input.mutable_data<float>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
if (platform::is_cpu_place(place_)) {
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.length());
} else {
#ifdef PADDLE_WITH_CUDA
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(),
0); // stream 0 for sync copy
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework::LoD lod;
for (auto &level : inputs[i].lod) {
......
......@@ -46,8 +46,6 @@ if(WITH_GPU)
endif()
endif(NOT WIN32)
endif()
include_directories("D:/Paddle/")
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
......
......@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
int max_batch_size = 1);
bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN();
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; }
friend class ::paddle::AnalysisPredictor;
......
......@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
bool specify_input_name{false};
// Set and get the number of cpu math library threads.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
protected:
// number of cpu math library (such as MKL, OpenBlas) threads for each
// instance.
int cpu_math_library_num_threads_{1};
};
// A factory to help create different predictors.
......
......@@ -19,9 +19,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SplitOp.
*/
class SplitOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
......@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
int axis = boost::get<int>(op_desc.GetAttr("axis"));
std::vector<int> output_lengths =
boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
// split on batch is not supported in TensorRT
PADDLE_ENFORCE(axis != 0);
if (axis < 0) {
axis += input_dims.nbDims;
} else {
axis -= 1;
}
axis += (axis < 0) ? input_dims.nbDims : -1;
PADDLE_ENFORCE(output_lengths.size() == output_num);
//
plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
......
......@@ -20,30 +20,92 @@ namespace paddle {
namespace inference {
namespace tensorrt {
TEST(split_op, test) {
template <int BatchSize, int Axis>
void TensorRTSplitTest(const std::vector<int> &in_shape,
const std::vector<int> &sections) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000);
auto make_dim = [](const std::vector<int> &shape) {
nvinfer1::DimsCHW dim;
dim.c() = shape[0];
dim.h() = shape[1];
dim.w() = shape[2];
return dim;
};
validator.DeclInputVar("split_input", make_dim(in_shape));
std::vector<std::string> output_vars;
for (size_t i = 0; i < sections.size(); ++i) {
auto out_shape = in_shape;
out_shape[Axis - 1] = sections[i];
std::string output_name = "split_out" + std::to_string(i);
validator.DeclOutputVar(output_name, make_dim(out_shape));
output_vars.push_back(output_name);
}
// Prepare Op description
framework::OpDesc desc;
desc.SetType("split");
desc.SetInput("X", {"split_input"});
desc.SetOutput("Out", {"split_out1", "split_out2"});
desc.SetOutput("Out", output_vars);
int num = 0;
int axis = 1;
std::vector<int> output_lengths = {2, 1};
desc.SetAttr("axis", axis);
desc.SetAttr("num", num);
desc.SetAttr("sections", output_lengths);
desc.SetAttr("axis", Axis);
desc.SetAttr("num", 0);
desc.SetAttr("sections", sections);
validator.SetOp(*desc.Proto());
validator.Execute(1);
validator.Execute(BatchSize);
}
// batch = 0, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch1) {
TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2});
}
// batch = 0, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch1) {
TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1});
}
// batch = 10, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch10) {
TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2});
}
// batch = 10, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch10) {
TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1});
}
// batch = 0, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch1) {
TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2});
}
// batch = 0, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch1) {
TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1});
}
// batch = 10, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch10) {
TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2});
}
// batch = 10, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch10) {
TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1});
}
// batch = 0, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch1) {
TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2});
}
// batch = 0, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch1) {
TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1});
}
// batch = 10, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch10) {
TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2});
}
// batch = 10, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch10) {
TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1});
}
} // namespace tensorrt
......
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle {
......@@ -19,6 +21,52 @@ namespace inference {
namespace tensorrt {
namespace plugin {
// copied from operators::math::SplitFunctor
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int curr_segment = 0;
int curr_offset = out_cols[0];
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
int curr_col_offset = out_cols[curr_segment + 1];
while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
curr_col_offset = out_cols[curr_segment + 1];
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs_data[curr_segment];
if (output_ptr != nullptr) {
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] =
input_data[tid_y * in_col + tid_x];
}
}
}
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int fixed_out_col,
T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x / fixed_out_col;
int in_offset = tid_x - split * fixed_out_col;
T* output_ptr = outputs_data[split];
if (output_ptr != nullptr) {
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * fixed_out_col + in_offset] =
input_data[tid_y * in_col + tid_x];
}
}
}
nvinfer1::Dims SplitPlugin::getOutputDimensions(
int index, const nvinfer1::Dims* input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(num_inputs, 1);
......@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
int SplitPlugin::initialize() {
PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
// notice input dims is [C, H, W]
nvinfer1::Dims dims = this->getInputDims(0);
outer_rows_ = 1;
inner_cols_ = 1;
for (int i = 0; i < axis_; ++i) {
outer_rows_ *= dims.d[i];
}
for (int i = axis_ + 1; i < dims.nbDims; ++i) {
inner_cols_ *= dims.d[i];
}
same_shape_ = true;
std::vector<int> segment_offsets(1, 0);
for (int i = 0; i < this->getNbOutputs(); ++i) {
segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
if (output_length_[i] != output_length_[0]) {
same_shape_ = false;
}
segment_offsets.push_back(segment_offsets.back() +
output_length_[i] * inner_cols_);
}
segment_offsets_ = segment_offsets;
nvinfer1::Dims dims = this->getInputDims(0);
nx_ = 1;
for (int i = dims.nbDims - 1; i > axis_; --i) {
nx_ *= dims.d[i];
inner_cols_ *= dims.d[axis_];
d_segment_offsets_ = segment_offsets;
segment_offsets_ = std::move(segment_offsets);
d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
return 0;
}
template <typename T>
inline void Split(cudaStream_t stream, const bool same_shape,
const int outer_rows, const int inner_cols,
const std::vector<int>& segment_offsets,
const int* d_segment_offsets, const T* input, T** outputs) {
const int kThreadsPerBlock = 1024;
const int kMaxBlocks = 65535;
int block_cols = kThreadsPerBlock;
if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((inner_cols + 31) >> 5) << 5;
}
ny_ = dims.d[axis_];
nz_ = 1;
for (int i = axis_ - 1; i >= 0; --i) {
nz_ *= dims.d[i];
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int grid_cols =
std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
int grid_rows =
std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (same_shape) {
SplitKernel<<<grid_size, block_size, 0, stream>>>(
input, outer_rows, inner_cols, segment_offsets[1], outputs);
} else {
SplitKernel<<<grid_size, block_size, 0, stream>>>(
input, outer_rows, inner_cols, d_segment_offsets,
static_cast<int>(segment_offsets.size()), outputs);
}
return 0;
}
int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
void** outputs, void* workspace, cudaStream_t stream) {
auto const& input_dims = this->getInputDims(0);
int input_size = 0;
float const* idata = reinterpret_cast<float const*>(inputs[0]);
float** odatas = reinterpret_cast<float**>(outputs);
// kernel impl here.
int inputBatchOffset = nx_ * ny_ * nz_;
for (size_t i = 0; i < this->getNbOutputs(); i++) {
for (size_t j = 0; j < batchSize; j++) {
cudaMemcpyAsync(
odatas[i] +
j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
sizeof(float),
inputs[0] +
(inputBatchOffset * j + segment_offsets_[i] * nx_) *
sizeof(float),
(segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
cudaMemcpyDeviceToDevice, stream);
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
this->getNbOutputs() < 10) {
float** output_ptrs = reinterpret_cast<float**>(outputs);
int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
? sizeof(float)
: sizeof(__half);
for (int i = 0; i < this->getNbOutputs(); ++i) {
PADDLE_ENFORCE(
cudaMemcpyAsync(
output_ptrs[i], input_ptr + segment_offsets_[i],
(segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
}
} else {
outer_rows_ *= batchSize;
const int* d_segment_offsets_ptr =
thrust::raw_pointer_cast(&d_segment_offsets_[0]);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
this->getNbOutputs() * sizeof(float*),
cudaMemcpyHostToDevice,
stream) == cudaSuccess);
if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
d_segment_offsets_ptr, input_ptr, output_ptrs);
} else {
Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT
(__half**)output_ptrs); // NOLINT
}
}
return cudaGetLastError() != cudaSuccess;
}
......
......@@ -14,6 +14,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -25,7 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
public:
SplitPlugin(int axis, std::vector<int> const &output_lengths)
: axis_(axis), output_length_(output_lengths) {}
: axis_(axis), same_shape_(true), output_length_(output_lengths) {}
SplitPlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
......@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
}
int axis_;
int outer_rows_;
int inner_cols_;
bool same_shape_;
std::vector<int> output_length_;
int nx_, ny_, nz_;
std::vector<int> segment_offsets_;
thrust::device_vector<int> d_segment_offsets_;
thrust::device_vector<float *> d_output_ptrs_;
};
} // namespace plugin
......
......@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
......@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
# anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1
set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL)
# anakin mobilenet
if(WITH_GPU)
set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
# anakin rnn1
set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL)
# anakin mobilenet
if(WITH_GPU)
set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
endif()
......@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
os << GenSpaces(num_spaces)
<< "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
......
......@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode.");
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace inference {
......@@ -206,22 +207,23 @@ void TestMultiThreadPrediction(
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis));
for (int tid = 1; tid < num_threads; ++tid) {
predictors.emplace_back(predictors.front()->Clone());
}
auto main_predictor = CreateTestPredictor(config, use_analysis);
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
#ifdef PADDLE_WITH_MKLDNN
platform::set_cur_thread_id(static_cast<int>(tid) + 1);
#endif
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<PaddleTensor> outputs_tid;
auto &predictor = predictors[tid];
// To ensure the thread binding correctly,
// please clone inside the threadpool.
auto predictor = main_predictor->Clone();
#ifdef PADDLE_WITH_MKLDNN
if (use_analysis) {
static_cast<AnalysisPredictor *>(predictor.get())
->SetMkldnnThreadID(static_cast<int>(tid) + 1);
}
#endif
// warmup run
LOG(INFO) << "Running thread " << tid << ", warm up run...";
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
std::string Benchmark::SerializeToString() const {
std::stringstream ss;
ss << "-----------------------------------------------------\n";
ss << "name\t";
ss << "batch_size\t";
ss << "num_threads\t";
ss << "latency\t";
ss << "qps";
ss << '\n';
ss << name_ << "\t";
ss << batch_size_ << "\t";
ss << num_threads_ << "\t";
ss << latency_ << "\t";
ss << 1000 / latency_;
ss << '\n';
return ss.str();
}
void Benchmark::PersistToFile(const std::string &path) const {
std::ofstream file(path, std::ios::app);
PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path);
file << SerializeToString();
file.flush();
file.close();
}
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
namespace paddle {
namespace inference {
/*
* Helper class to calculate the performance.
*/
struct Benchmark {
int batch_size() const { return batch_size_; }
void SetBatchSize(int x) { batch_size_ = x; }
int num_threads() const { return num_threads_; }
void SetNumThreads(int x) { num_threads_ = x; }
bool use_gpu() const { return use_gpu_; }
void SetUseGpu() { use_gpu_ = true; }
int latency() const { return latency_; }
void SetLatency(int x) { latency_ = x; }
const std::string& name() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
std::string SerializeToString() const;
void PersistToFile(const std::string& path) const;
private:
bool use_gpu_{false};
int batch_size_{0};
int latency_;
int num_threads_{1};
std::string name_;
};
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
using namespace paddle::inference;
TEST(Benchmark, basic) {
Benchmark benchmark;
benchmark.SetName("key0");
benchmark.SetBatchSize(10);
benchmark.SetUseGpu();
benchmark.SetLatency(220);
LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString();
}
TEST(Benchmark, PersistToFile) {
Benchmark benchmark;
benchmark.SetName("key0");
benchmark.SetBatchSize(10);
benchmark.SetUseGpu();
benchmark.SetLatency(220);
benchmark.PersistToFile("1.log");
benchmark.PersistToFile("1.log");
benchmark.PersistToFile("1.log");
}
\ No newline at end of file
......@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t thread_num = 32;
size_t sleep_time = 40;
size_t extra_time = 2;
size_t extra_time = 10;
// Reserve to perform more tests in the future
std::vector<std::shared_ptr<Allocator>> allocators;
......
......@@ -22,7 +22,7 @@ iou_similarity_op.cu)
detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(density_prior_box_op SRCS density_prior_box_op.cc)
detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
detection_library(anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu)
detection_library(target_assign_op SRCS target_assign_op.cc
......
......@@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
auto fixed_sizes = ctx->Attrs().Get<std::vector<float>>("fixed_sizes");
auto fixed_ratios = ctx->Attrs().Get<std::vector<float>>("fixed_ratios");
auto densities = ctx->Attrs().Get<std::vector<int>>("densities");
bool flatten = ctx->Attrs().Get<bool>("flatten_to_2d");
PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(),
"The number of fixed_sizes and densities must be equal.");
size_t num_priors = 0;
if ((fixed_sizes.size() > 0) && (densities.size() > 0)) {
for (size_t i = 0; i < densities.size(); ++i) {
if (fixed_ratios.size() > 0) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
}
for (size_t i = 0; i < densities.size(); ++i) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
if (!flatten) {
std::vector<int64_t> dim_vec(4);
dim_vec[0] = input_dims[2];
dim_vec[1] = input_dims[3];
dim_vec[2] = num_priors;
dim_vec[3] = 4;
ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
} else {
int64_t dim0 = input_dims[2] * input_dims[3] * num_priors;
ctx->SetOutputDim("Boxes", {dim0, 4});
ctx->SetOutputDim("Variances", {dim0, 4});
}
std::vector<int64_t> dim_vec(4);
dim_vec[0] = input_dims[2];
dim_vec[1] = input_dims[3];
dim_vec[2] = num_priors;
dim_vec[3] = 4;
ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
}
protected:
......@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace());
ctx.GetPlace());
}
};
......@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
.SetDefault(true);
AddAttr<bool>("flatten_to_2d",
"(bool) Whether to flatten to 2D and "
"the second dim is 4.")
.SetDefault(false);
AddAttr<float>(
"step_w",
"Density prior boxes step across width, 0.0 for auto calculation.")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
namespace paddle {
namespace operators {
template <typename T>
static __device__ inline T Clip(T in) {
return min(max(in, 0.), 1.);
}
template <typename T>
static __global__ void GenDensityPriorBox(
const int height, const int width, const int im_height, const int im_width,
const T offset, const T step_width, const T step_height,
const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin,
const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) {
int gidx = blockIdx.x * blockDim.x + threadIdx.x;
int gidy = blockIdx.y * blockDim.y + threadIdx.y;
int step_x = blockDim.x * gridDim.x;
int step_y = blockDim.y * gridDim.y;
const T* width_ratio = ratios_shift;
const T* height_ratio = ratios_shift + num_priors;
const T* width_shift = ratios_shift + 2 * num_priors;
const T* height_shift = ratios_shift + 3 * num_priors;
for (int j = gidy; j < height; j += step_y) {
for (int i = gidx; i < width * num_priors; i += step_x) {
int h = j;
int w = i / num_priors;
int k = i % num_priors;
T center_x = (w + offset) * step_width;
T center_y = (h + offset) * step_height;
T center_x_temp = center_x + width_shift[k];
T center_y_temp = center_y + height_shift[k];
T box_width_ratio = width_ratio[k] / 2.;
T box_height_ratio = height_ratio[k] / 2.;
T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.);
T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.);
T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.);
T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.);
int out_offset = (j * width * num_priors + i) * 4;
out[out_offset] = is_clip ? Clip<T>(xmin) : xmin;
out[out_offset + 1] = is_clip ? Clip<T>(ymin) : ymin;
out[out_offset + 2] = is_clip ? Clip<T>(xmax) : xmax;
out[out_offset + 3] = is_clip ? Clip<T>(ymax) : ymax;
var[out_offset] = var_xmin;
var[out_offset + 1] = var_ymin;
var[out_offset + 2] = var_xmax;
var[out_offset + 3] = var_ymax;
}
}
}
template <typename T>
class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* image = ctx.Input<paddle::framework::Tensor>("Image");
auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto is_clip = ctx.Attr<bool>("clip");
auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
auto densities = ctx.Attr<std::vector<int>>("densities");
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto img_width = image->dims()[3];
auto img_height = image->dims()[2];
auto feature_width = input->dims()[3];
auto feature_height = input->dims()[2];
T step_width, step_height;
if (step_w == 0 || step_h == 0) {
step_width = static_cast<T>(img_width) / feature_width;
step_height = static_cast<T>(img_height) / feature_height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = 0;
for (size_t i = 0; i < densities.size(); ++i) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
int step_average = static_cast<int>((step_width + step_height) * 0.5);
framework::Tensor h_temp;
T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
int idx = 0;
for (size_t s = 0; s < fixed_sizes.size(); ++s) {
auto fixed_size = fixed_sizes[s];
int density = densities[s];
for (size_t r = 0; r < fixed_ratios.size(); ++r) {
float ar = fixed_ratios[r];
int shift = step_average / density;
float box_width_ratio = fixed_size * sqrt(ar);
float box_height_ratio = fixed_size / sqrt(ar);
for (int di = 0; di < density; ++di) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
float center_y_temp = shift / 2. + di * shift - step_average / 2.;
tdata[idx] = box_width_ratio;
tdata[num_priors + idx] = box_height_ratio;
tdata[2 * num_priors + idx] = center_x_temp;
tdata[3 * num_priors + idx] = center_y_temp;
idx++;
}
}
}
}
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor d_temp;
framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);
// At least use 32 threads, at most 512 threads.
// blockx is multiple of 32.
int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
int gridx = (feature_width * num_priors + blockx - 1) / blockx;
dim3 threads(blockx, 1);
dim3 grids(gridx, feature_height);
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
feature_height, feature_width, img_height, img_width, offset,
step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
variances[0], variances[1], variances[2], variances[3],
boxes->data<T>(), vars->data<T>());
}
}; // namespace operators
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(density_prior_box,
ops::DensityPriorBoxOpCUDAKernel<float>,
ops::DensityPriorBoxOpCUDAKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
......@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
step_height = step_h;
}
int num_priors = 0;
if (fixed_sizes.size() > 0 && densities.size() > 0) {
for (size_t i = 0; i < densities.size(); ++i) {
if (fixed_ratios.size() > 0) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
}
for (size_t i = 0; i < densities.size(); ++i) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
auto box_dim = vars->dims();
boxes->Resize({feature_height, feature_width, num_priors, 4});
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes).setConstant(0.0);
int step_average = static_cast<int>((step_width + step_height) * 0.5);
for (int h = 0; h < feature_height; ++h) {
......@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
auto fixed_size = fixed_sizes[s];
int density = densities[s];
// Generate density prior boxes with fixed ratios.
if (fixed_ratios.size() > 0) {
for (size_t r = 0; r < fixed_ratios.size(); ++r) {
float ar = fixed_ratios[r];
int shift = step_average / density;
float box_width_ratio = fixed_size * sqrt(ar);
float box_height_ratio = fixed_size / sqrt(ar);
for (int di = 0; di < density; ++di) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp =
center_x - step_average / 2. + shift / 2. + dj * shift;
float center_y_temp =
center_y - step_average / 2. + shift / 2. + di * shift;
e_boxes(h, w, idx, 0) =
(center_x_temp - box_width_ratio / 2.) / img_width >= 0
? (center_x_temp - box_width_ratio / 2.) / img_width
: 0;
e_boxes(h, w, idx, 1) =
(center_y_temp - box_height_ratio / 2.) / img_height >= 0
? (center_y_temp - box_height_ratio / 2.) / img_height
: 0;
e_boxes(h, w, idx, 2) =
(center_x_temp + box_width_ratio / 2.) / img_width <= 1
? (center_x_temp + box_width_ratio / 2.) / img_width
: 1;
e_boxes(h, w, idx, 3) =
(center_y_temp + box_height_ratio / 2.) / img_height <= 1
? (center_y_temp + box_height_ratio / 2.) / img_height
: 1;
idx++;
}
for (size_t r = 0; r < fixed_ratios.size(); ++r) {
float ar = fixed_ratios[r];
int shift = step_average / density;
float box_width_ratio = fixed_size * sqrt(ar);
float box_height_ratio = fixed_size / sqrt(ar);
for (int di = 0; di < density; ++di) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp =
center_x - step_average / 2. + shift / 2. + dj * shift;
float center_y_temp =
center_y - step_average / 2. + shift / 2. + di * shift;
e_boxes(h, w, idx, 0) =
(center_x_temp - box_width_ratio / 2.) / img_width >= 0
? (center_x_temp - box_width_ratio / 2.) / img_width
: 0;
e_boxes(h, w, idx, 1) =
(center_y_temp - box_height_ratio / 2.) / img_height >= 0
? (center_y_temp - box_height_ratio / 2.) / img_height
: 0;
e_boxes(h, w, idx, 2) =
(center_x_temp + box_width_ratio / 2.) / img_width <= 1
? (center_x_temp + box_width_ratio / 2.) / img_width
: 1;
e_boxes(h, w, idx, 3) =
(center_y_temp + box_height_ratio / 2.) / img_height <= 1
? (center_y_temp + box_height_ratio / 2.) / img_height
: 1;
idx++;
}
}
}
......@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
vars->Resize(var_dim);
boxes->Resize(box_dim);
}
}; // namespace operators
......
......@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(rpc_disable_reuse_port);
namespace paddle {
namespace operators {
namespace distributed {
......@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// Channel configurations:
grpc::ChannelArguments args;
args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
if (FLAGS_rpc_disable_reuse_port) {
args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
}
args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
......
......@@ -20,6 +20,8 @@ limitations under the License. */
using ::grpc::ServerAsyncResponseWriter;
DECLARE_bool(rpc_disable_reuse_port);
namespace paddle {
namespace operators {
namespace distributed {
......@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
VLOG(40) << "AsyncGRPCServer WaitSeverReady";
}
// Define an option subclass in order to disable SO_REUSEPORT for the
// server socket.
// Come from:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
class NoReusePortOption : public ::grpc::ServerBuilderOption {
public:
void UpdateArguments(::grpc::ChannelArguments* args) override {
args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
}
void UpdatePlugins(std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>*
plugins) override {}
};
void AsyncGRPCServer::StartServer() {
::grpc::ServerBuilder builder;
builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
......@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
if (FLAGS_rpc_disable_reuse_port) {
builder.SetOption(
std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
}
builder.RegisterService(&service_);
for (auto t : rpc_call_map_) {
......
......@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
namespace paddle {
namespace operators {
namespace distributed {
......
include(operators)
register_operators()
register_operators(EXCLUDES fusion_transpose_flatten_concat_op)
if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
"Inputs(X) of ConcatOp should be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ConcatOp should not be null.");
auto ins = ctx->GetInputsDim("X");
const size_t n = ins.size();
PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
std::vector<int> trans_axis =
ctx->Attrs().Get<std::vector<int>>("trans_axis");
int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
int concat_axis = ctx->Attrs().Get<int>("concat_axis");
size_t x_rank = ins[0].size();
size_t trans_axis_size = trans_axis.size();
PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
"The input tensor's rank(%d) "
"should be equal to the permutation axis's size(%d)",
x_rank, trans_axis_size);
auto dims0 =
GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
std::vector<int> out_dims(dims0);
for (size_t i = 1; i < n; i++) {
auto dimsi =
GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
if (j == concat_axis) {
out_dims[concat_axis] += dimsi[j];
} else {
PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
"After flatting, the %d-th dim should be save "
"except the specify axis.",
j);
}
}
}
if (out_dims[concat_axis] < 0) {
out_dims[concat_axis] = -1;
}
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
}
};
class TransposeFlattenConcatFusionOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor) The input tensor, tensors with rank up to 6 are supported.")
.AsDuplicable();
AddOutput("Out", "(Tensor)The output tensor.");
AddAttr<std::vector<int>>(
"trans_axis",
"(vector<int>) A list of values, and the size of the list should be "
"the same with the input tensor rank. This operator permutes the input "
"tensor's axes according to the values given.");
AddAttr<int>("flatten_axis",
"(int)"
"Indicate up to which input dimensions (exclusive) should be"
"flattened to the outer dimension of the output. The value"
"for axis must be in the range [0, R], where R is the rank of"
"the input tensor. When axis = 0, the shape of the output"
"tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
"input tensor is (d_0, d_1, ... d_n).");
AddAttr<int>("concat_axis",
"The axis along which the input tensors will be concatenated. "
"It should be 0 or 1, since the tensor is 2D after flatting.");
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_transpose_flatten_concat,
ops::TransposeFlattenConcatFusionOp,
ops::TransposeFlattenConcatFusionOpMaker,
paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto odims = out->dims();
std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
int flatten_axis = ctx.Attr<int>("flatten_axis");
int concat_axis = ctx.Attr<int>("concat_axis");
int rank = ins[0]->dims().size();
// use at least 4D in cudnnTransformTensor
int max_dim = rank < 4 ? 4 : rank;
std::vector<int> stride_x(max_dim, 0);
std::vector<int> stride_y(max_dim, 0);
std::vector<int> dims_y(max_dim, 0);
cudnnTensorDescriptor_t in_desc;
cudnnTensorDescriptor_t out_desc;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
T* odata = out->data<T>();
for (size_t k = 0; k < ins.size(); ++k) {
auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
int osize = 1;
auto idims = ins[k]->dims();
for (int i = 0; i < rank; i++) {
stride_x[i] = 1;
for (int j = trans_axis[i] + 1; j < rank; j++) {
stride_x[i] *= idims[j];
}
dims_y[i] = perm_shape[i];
osize *= perm_shape[i];
}
stride_y[rank - 1] = 1;
for (int i = rank - 2; i >= 0; i--) {
if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
stride_y[i] = odims[1];
} else {
stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
}
}
// Since concat is aftern flatten, the output is 2D tensor.
// If concat_axis is 0, each input's permutated tensor is continuous.
// If concat_axis is 1, the stride of 0-th dim of each input's
// permutated tensor is odims()[1].
for (int i = rank; i < max_dim; i++) {
stride_x[i] = 1;
stride_y[i] = 1;
dims_y[i] = 1;
}
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
handle, CudnnDataType<T>::kOne(), in_desc,
static_cast<const void*>(ins[k]->data<T>()),
CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
if (concat_axis == 0) {
odata += osize;
} else {
auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
odata += flat_shape[1];
}
}
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(fusion_transpose_flatten_concat,
ops::TransposeFlattenConcatFusionKernel<float>,
ops::TransposeFlattenConcatFusionKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace paddle {
namespace operators {
inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
const framework::DDim& in_dims) {
std::vector<int32_t> out_dims(in_dims.size());
for (size_t i = 0; i < axis.size(); i++) {
out_dims[i] = in_dims[axis[i]];
}
return out_dims;
}
inline std::vector<int32_t> GetFlattenShape(const int axis,
const std::vector<int>& in_dims) {
int64_t outer = 1, inner = 1;
for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
if (i < axis) {
outer *= in_dims[i];
} else {
inner *= in_dims[i];
}
}
std::vector<int32_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
return out_shape;
}
} // namespace operators
} // namespace paddle
......@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
framework::proto::VarType::FP32,
"The sparse table only support FP32");
w_t->Get(ids_t, out_t, true, is_test);
out_t->set_lod(ids_t.lod());
}
};
......
......@@ -17,8 +17,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace operators {
namespace math {
......@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.template Get<jitkernel::VAddKernel<T>>(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#pragma omp parallel for
#endif
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
......
......@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
math::scatter::MergeAdd<DeviceContext, T> merge_add;
merge_add(context.template device_context<DeviceContext>(), inputs,
out);
out->SyncIndex();
} else {
// no data, just set a empty out tensor.
out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
......
......@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_inx_dim[0] = inx.size();
out_inx.Resize(out_inx_dim);
auto &local_scope = scope.NewScope();
std::string var_name = "out_index";
framework::Variable *tmp_index_var =
const_cast<framework::Scope &>(scope).Var(var_name);
framework::Variable *tmp_index_var = local_scope.Var(var_name);
auto &tmp_index_tensor =
*(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
tmp_index_tensor.Resize(out_inx_dim);
......@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_dims[axis] = out_dim_sum;
out.Resize(out_dims);
LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
// Invoke Reshape Op
LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names);
// Invoke concat Op
auto concat_op = framework::OpRegistry::CreateOp(
"concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
concat_op->Run(scope, place);
concat_op->Run(local_scope, place);
}
};
......
......@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(num_threads);
omp_set_num_threads(real_num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
......
......@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched);
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode);
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx);
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
......
......@@ -5,8 +5,8 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
DEPS ARCHIVE_START ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END)
else()
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
......
......@@ -94,6 +94,30 @@ function cmake_gen() {
else
exit 1
fi
elif [ "$1" == "cp36-cp36m" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
else
exit 1
fi
elif [ "$1" == "cp37-cp37m" ]; then
if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/
export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
else
exit 1
fi
fi
else
if [ "$1" != "" ]; then
......@@ -116,6 +140,18 @@ function cmake_gen() {
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
elif [ "$1" == "cp36-cp36m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
elif [ "$1" == "cp37-cp37m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
fi
fi
fi
......@@ -419,7 +455,7 @@ function assert_api_not_changed() {
source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
if [ "$1" == "cp35-cp35m" ]; then
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
# Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' new.spec
sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
......
......@@ -28,9 +28,14 @@ int main(int argc, char** argv) {
for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"));
#elif __clang__
new_argv.push_back(
strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"));
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
#else
new_argv.push_back(
strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
......
......@@ -91,6 +91,7 @@ def __bootstrap__():
"""
import sys
import os
import platform
from . import core
in_test = 'unittest' in sys.modules
......@@ -110,14 +111,17 @@ def __bootstrap__():
print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system()
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
'eager_delete_tensor_gb', 'allocator_strategy',
'reader_queue_speed_test_mode', 'print_sub_graph_dir'
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
if os.name != 'nt':
read_env_flags.append('warpctc_dir')
read_env_flags.append('cpu_deterministic')
......@@ -129,6 +133,7 @@ def __bootstrap__():
read_env_flags.append('rpc_send_thread_num')
read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num')
read_env_flags.append('rpc_disable_reuse_port')
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -13,8 +13,10 @@
# limitations under the License.
from __future__ import print_function
from . import lookup_table_utils
from .lookup_table_utils import *
from . import hdfs_utils
from .hdfs_utils import *
__all__ = lookup_table_utils.__all__
__all__ = hdfs_utils.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import time
import logging
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import io
from paddle.fluid import Program
__all__ = [
"load_inference_model", "load_persistable_vars",
"convert_dist_to_sparse_program"
]
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
_logger = logging.getLogger("lookup_table_utils")
_logger.setLevel(logging.INFO)
model_filename = "__model__"
lookup_table_dir = "__lookup_table__"
def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
main_program.global_block()._insert_op(
index=idx,
type="lookup_sparse_table",
inputs={"Ids": [ids],
"W": [w]},
outputs={"Out": [out]},
attrs={
"is_distributed": False,
"is_sparse": True,
"grad_inplace": False
})
def __get_prefetch_op_tuples(main_program):
# current lookup tables op is split_ids->prefetch->merge_ids
prefetch_op_tuples = None
op_types = [op.type for op in main_program.global_block().ops]
for i in range(len(op_types)):
if op_types[i] == "prefetch":
if op_types[i - 1] == "split_ids" and op_types[i +
1] == "merge_ids":
split_ids_op_id = i - 1
split_ids_inputs = main_program.global_block().ops[i - 1].input(
"Ids")
prefetch_op_inputs = main_program.global_block().ops[i].input(
"X")
prefetch_op_outputs = main_program.global_block().ops[i].output(
"Out")
merge_ids_outputs = main_program.global_block().ops[
i + 1].output("Out")
need_delete_vars = []
need_delete_vars.extend(prefetch_op_inputs)
need_delete_vars.extend(prefetch_op_outputs)
prefetch_op_tuples = (split_ids_op_id, split_ids_inputs,
merge_ids_outputs, need_delete_vars)
break
return prefetch_op_tuples
def convert_dist_to_sparse_program(main_program):
if not main_program._distributed_lookup_table:
_logger.warn(
"There are no distributed lookup tables need to be converted")
return
# create table param and grad var in pserver program
origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
emb_var = main_program._distributed_lookup_table
main_program.global_block()._rename_var(emb_var, origin_emb_var)
origin_param_var = main_program.global_block().vars[origin_emb_var]
param_var = main_program.global_block().create_var(
name=emb_var,
shape=origin_param_var.shape,
dtype=origin_param_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True)
# parameter must be selected rows
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
main_program._sync_with_cpp()
prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
split_ids_id = prefetch_op_tuples[0]
for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
main_program.global_block()._remove_op(idx)
main_program.desc.flush()
in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
for in_out_pair in in_out_pairs:
idx = split_ids_id
ids = main_program.global_block().vars[in_out_pair[0]]
out = main_program.global_block().vars[in_out_pair[1]]
__insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
main_program.desc.flush()
return main_program
def load_persistable_vars(executor, dirname, program, lookup_table_var):
def _is_checkpoint_var(exclude_fluid_vars=None):
"""
the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
: param var(Variable)
"""
if exclude_fluid_vars is None:
exclude_fluid_vars = []
def is_valid(var):
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
# @GRAD are named for gradient variables, checkpoint will not save it.
if "@GRAD" in var.name:
return False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if ".trainer_" in var.name:
return False
# .block is named for distribute train variables, checkpoint will not save it.
if ".block" in var.name:
return False
if "tmp_" in var.name:
return False
if var.name in exclude_fluid_vars:
return False
return var.persistable
return is_valid
def _load_lookup_table_vars(executor, dirname, main_program,
lookup_table_vars):
if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname)
lookup_table_dirname = os.path.join(dirname, lookup_table_dir)
emb_var_name = lookup_table_vars[0]
emb_var = main_program.global_block().var(emb_var_name)
emb_files = []
for emb_name in os.listdir(lookup_table_dirname):
if emb_var_name in emb_name:
emb_files.append(emb_name)
convert_program = Program()
global_block = convert_program.global_block()
emb_var = global_block.create_var(
name=emb_var.name,
shape=emb_var.shape,
dtype=emb_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True)
emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
sums = []
for i, emb_file in enumerate(emb_files):
var_name = "{}_{}".format(emb_var.name, i)
param_var = global_block.create_var(
name=var_name,
shape=emb_var.shape,
dtype=emb_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True)
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
global_block.append_op(
type='load',
inputs={},
outputs={'Out': [param_var]},
attrs={
'file_path': os.path.join(lookup_table_dirname, var_name)
})
sums.append(param_var)
global_block.append_op(
type='sum', inputs={"X": sums}, outputs={'Out': emb_var}, attrs={})
global_block.append_op(type='delete_var', inputs={'X': sums})
executor.run(convert_program)
_logger.info("Start Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
lookup_table_vars = [lookup_table_var]
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
_load_lookup_table_vars(executor, dirname, program, lookup_table_vars)
_logger.info("Finish Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname)
local_model = os.path.join(dirname, model_filename)
with open(local_model, "rb") as f:
program_desc_str = f.read()
program = Program.parse_from_string(program_desc_str)
if not core._is_program_version_supported(program._version()):
raise ValueError("Unsupported program version: %d\n" %
program._version())
# Binary data also need version.
load_persistable_vars(executor, dirname, program, lookup_table_var_name)
feed_target_names = program.desc.get_feed_target_names()
fetch_target_names = program.desc.get_fetch_target_names()
fetch_targets = [
program.global_block().var(name) for name in fetch_target_names
]
return [program, feed_target_names, fetch_targets]
......@@ -1698,6 +1698,7 @@ class Program(object):
p._copy_param_info_from(self)
p._copy_data_info_from(self)
p._copy_dist_param_info_from(self)
return p
def _prune(self, targets):
......@@ -1938,6 +1939,25 @@ class Program(object):
"program, with represent the same topology")
self.global_block()._copy_param_info_from(other.global_block())
def _copy_dist_param_info_from(self, other):
"""
Copy the information of distributed information from other program.
Args:
other(Program): Other program
Returns:
None
"""
if not isinstance(other, Program):
raise TypeError("_copy_dist_param_info_from should be invoked with "
"Program")
self._is_distributed = other._is_distributed
self._is_chief = other._is_chief
self._slice_vars_and_attrs = other._slice_vars_and_attrs
self._endpoints = other._endpoints
self._distributed_lookup_table = other._distributed_lookup_table
def _copy_data_info_from(self, other):
"""
Copy the information of data variables from other program.
......
......@@ -165,6 +165,7 @@ def save_vars(executor,
save_vars(
executor,
main_program=main_program,
dirname=dirname,
vars=list(filter(predicate, main_program.list_vars())),
filename=filename)
......@@ -172,11 +173,18 @@ def save_vars(executor,
save_program = Program()
save_block = save_program.global_block()
if main_program is None:
main_program = default_main_program()
if not isinstance(main_program, Program):
raise TypeError("program should be as Program type or None")
save_var_map = {}
for each_var in vars:
# NOTE: don't save the variable which type is RAW
if each_var.type == core.VarDesc.VarType.RAW:
continue
if each_var.name == main_program._distributed_lookup_table:
continue
new_var = _clone_var_in_block_(save_block, each_var)
if filename is None:
save_block.append_op(
......@@ -198,6 +206,16 @@ def save_vars(executor,
outputs={},
attrs={'file_path': os.path.join(dirname, filename)})
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
attrs = {}
attrs['epmap'] = main_program._endpoints
attrs['dir'] = lookup_table_filename
attrs['lookup_table'] = main_program._distributed_lookup_table
save_block.append_op(
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
executor.run(save_program)
......@@ -379,11 +397,22 @@ def load_vars(executor,
load_prog = Program()
load_block = load_prog.global_block()
if main_program is None:
main_program = default_main_program()
if not isinstance(main_program, Program):
raise TypeError("program should be as Program type or None")
load_slice_vars = []
for each_var in main_program._slice_vars_and_attrs:
load_slice_vars.append(each_var[2].name)
load_var_map = {}
for each_var in vars:
assert isinstance(each_var, Variable)
if each_var.type == core.VarDesc.VarType.RAW:
continue
if each_var.name in load_slice_vars:
continue
new_var = _clone_var_in_block_(load_block, each_var)
if filename is None:
load_block.append_op(
......@@ -406,9 +435,6 @@ def load_vars(executor,
attrs={'file_path': os.path.join(dirname, filename)})
executor.run(load_prog)
if main_program is None:
main_program = default_main_program()
# load slice vars on pserver, if have it.
_load_slice_up_vars(executor, dirname,
main_program._slice_vars_and_attrs)
......@@ -618,13 +644,6 @@ def save_inference_model(dirname,
if main_program is None:
main_program = default_main_program()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
# when a pserver and a trainer running on the same machine, mkdir may conflict
try:
os.makedirs(dirname)
......@@ -642,6 +661,9 @@ def save_inference_model(dirname,
# it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be
# more flexible.
origin_program = main_program.clone()
if export_for_deployment:
main_program = main_program.clone()
global_block = main_program.global_block()
......@@ -666,8 +688,11 @@ def save_inference_model(dirname,
with open(model_basename + ".main_program", "wb") as f:
f.write(main_program.desc.serialize_to_string())
main_program._copy_dist_param_info_from(origin_program)
if params_filename is not None:
params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename)
......@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
slice_var = var_tuple[2]
end = start + slice_var.shape[0]
orig_var_name = orig_var.name
orig_var.name = "{}.origin".format(orig_var_name)
clone_orig_var = load_block.create_var(
name=orig_var.name,
type=orig_var.type,
......@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
type='load',
inputs={},
outputs={'Out': [clone_orig_var]},
attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
attrs={'file_path': os.path.join(dirname, orig_var_name)})
load_block.append_op(
type="slice",
inputs={'Input': clone_orig_var},
......@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
'starts': [start],
'ends': [end]})
need_delete_vars.append(clone_orig_var)
load_block.append_op(
type='delete_var',
inputs={'X': need_delete_vars}, )
......
......@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):
def increment(x, value=1.0, in_place=True):
"""
This function performs an operation that increments each value in the
This function performs an operation that increments the value in the
input :math:`x` by an amount: :math:`value` as mentioned in the input
parameter. This operation is performed in-place by default.
parameter. This operation is performed in-place by default. Notice that
the number of elements in :math:`x` must be equal to 1.
Args:
x (Variable|list): The tensor that has the input values.
......@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
data = fluid.layers.data(name='data', shape=[1], dtype='float32',
append_batch_size=False)
data = fluid.layers.increment(x=data, value=3.0, in_place=True)
"""
helper = LayerHelper("increment", **locals())
......
......@@ -1029,6 +1029,7 @@ def density_prior_box(input,
clip=False,
steps=[0.0, 0.0],
offset=0.5,
flatten_to_2d=False,
name=None):
"""
**Density Prior Box Operator**
......@@ -1065,22 +1066,24 @@ def density_prior_box(input,
height/weight of the input will be automatically calculated.
Default: [0., 0.]
offset(float): Prior boxes center offset. Default: 0.5
flatten_to_2d(bool): Whether to flatten output prior boxes and variance
to 2D shape, the second dim is 4. Default: False.
name(str): Name of the density prior box op. Default: None.
Returns:
tuple: A tuple with two Variable (boxes, variances)
boxes: the output density prior boxes of PriorBox.
The layout is [H, W, num_priors, 4].
H is the height of input, W is the width of input,
num_priors is the total
box count of each position of input.
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input,
num_priors is the total box count of each position of input.
variances: the expanded variances of PriorBox.
The layout is [H, W, num_priors, 4].
H is the height of input, W is the width of input
num_priors is the total
box count of each position of input
The layout is [H, W, num_priors, 4] when flatten_to_2d is False.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input
num_priors is the total box count of each position of input.
Examples:
......@@ -1089,14 +1092,11 @@ def density_prior_box(input,
box, var = fluid.layers.density_prior_box(
input=conv1,
image=images,
min_sizes=[100.],
max_sizes=[200.],
aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
densities=[3, 4],
fixed_sizes=[50., 60.],
fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
flip=True,
clip=True)
densities=[4, 2, 1],
fixed_sizes=[32.0, 64.0, 128.0],
fixed_ratios=[1.],
clip=True,
flatten_to_2d=True)
"""
helper = LayerHelper("density_prior_box", **locals())
dtype = helper.input_dtype()
......@@ -1127,14 +1127,11 @@ def density_prior_box(input,
'step_w': steps[0],
'step_h': steps[1],
'offset': offset,
'densities': densities,
'fixed_sizes': fixed_sizes,
'fixed_ratios': fixed_ratios,
'flatten_to_2d': flatten_to_2d,
}
if densities is not None and len(densities) > 0:
attrs['densities'] = densities
if fixed_sizes is not None and len(fixed_sizes) > 0:
attrs['fixed_sizes'] = fixed_sizes
if fixed_ratios is not None and len(fixed_ratios) > 0:
attrs['fixed_ratios'] = fixed_ratios
box = helper.create_variable_for_type_inference(dtype)
var = helper.create_variable_for_type_inference(dtype)
helper.append_op(
......
......@@ -2134,11 +2134,16 @@ def pool2d(input,
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
pool_size (int): The side length of pooling windows. All pooling
windows are squares with pool_size on a side.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
......@@ -6967,18 +6972,18 @@ def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
y = \max(0, x) + alpha * \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing. It supports all, channel
and element. all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
......@@ -6987,7 +6992,7 @@ def prelu(x, mode, param_attr=None, name=None):
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
......
......@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
filter_size,
param_attr=None,
act="sigmoid",
pool_type="max"):
pool_type="max",
bias_attr=None):
"""
The sequence_conv_pool is composed with Sequence Convolution and Pooling.
......@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
Default :math:`max`.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
Return:
Variable: The final result after Sequence Convolution and Pooling.
......@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
bias_attr=bias_attr,
act=act)
pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
......
......@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
assert len(results[0]) == len(transpiler_results[0])
for i in range(len(results[0])):
np.testing.assert_almost_equal(
results[0][i], transpiler_results[0][i], decimal=5)
results[0][i], transpiler_results[0][i], decimal=4)
print("infer results: ", results[0])
......
......@@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase):
class TestPriorBox(unittest.TestCase):
def test_prior_box(self):
data_shape = [3, 224, 224]
images = fluid.layers.data(
name='pixel', shape=data_shape, dtype='float32')
conv1 = fluid.layers.conv2d(images, 3, 3, 2)
box, var = layers.prior_box(
input=conv1,
image=images,
min_sizes=[100.0],
aspect_ratios=[1.],
flip=True,
clip=True)
assert len(box.shape) == 4
assert box.shape == var.shape
assert box.shape[3] == 4
program = Program()
with program_guard(program):
data_shape = [3, 224, 224]
images = fluid.layers.data(
name='pixel', shape=data_shape, dtype='float32')
conv1 = fluid.layers.conv2d(images, 3, 3, 2)
box, var = layers.prior_box(
input=conv1,
image=images,
min_sizes=[100.0],
aspect_ratios=[1.],
flip=True,
clip=True)
assert len(box.shape) == 4
assert box.shape == var.shape
assert box.shape[3] == 4
class TestDensityPriorBox(unittest.TestCase):
def test_density_prior_box(self):
data_shape = [3, 224, 224]
images = fluid.layers.data(
name='pixel', shape=data_shape, dtype='float32')
conv1 = fluid.layers.conv2d(images, 3, 3, 2)
box, var = layers.density_prior_box(
input=conv1,
image=images,
densities=[3, 4],
fixed_sizes=[50., 60.],
fixed_ratios=[1.0],
clip=True)
assert len(box.shape) == 4
assert box.shape == var.shape
assert box.shape[3] == 4
program = Program()
with program_guard(program):
data_shape = [3, 224, 224]
images = fluid.layers.data(
name='pixel', shape=data_shape, dtype='float32')
conv1 = fluid.layers.conv2d(images, 3, 3, 2)
box, var = layers.density_prior_box(
input=conv1,
image=images,
densities=[3, 4],
fixed_sizes=[50., 60.],
fixed_ratios=[1.0],
clip=True)
assert len(box.shape) == 4
assert box.shape == var.shape
assert box.shape[-1] == 4
class TestAnchorGenerator(unittest.TestCase):
......
......@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
'offset': self.offset,
'densities': self.densities,
'fixed_sizes': self.fixed_sizes,
'fixed_ratios': self.fixed_ratios
'fixed_ratios': self.fixed_ratios,
'flatten_to_2d': self.flatten_to_2d
}
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
......@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
self.set_data()
def set_density(self):
self.densities = []
self.fixed_sizes = []
self.fixed_ratios = []
self.densities = [4, 2, 1]
self.fixed_sizes = [32.0, 64.0, 128.0]
self.fixed_ratios = [1.0]
self.layer_w = 17
self.layer_h = 17
self.image_w = 533
self.image_h = 533
self.flatten_to_2d = False
def init_test_params(self):
self.layer_w = 32
self.layer_h = 32
self.image_w = 40
self.image_h = 40
self.set_density()
self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h)
......@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
self.variances = [0.1, 0.1, 0.2, 0.2]
self.variances = np.array(self.variances, dtype=np.float).flatten()
self.set_density()
self.clip = True
self.num_priors = 0
if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
......@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
(self.layer_h, self.layer_w, self.num_priors, 1))
self.out_boxes = out_boxes.astype('float32')
self.out_var = out_var.astype('float32')
if self.flatten_to_2d:
self.out_boxes = self.out_boxes.reshape((-1, 4))
self.out_var = self.out_var.reshape((-1, 4))
class TestDensityPriorBox(TestDensityPriorBoxOp):
......@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
self.densities = [3, 4]
self.fixed_sizes = [1.0, 2.0]
self.fixed_ratios = [1.0]
self.layer_w = 32
self.layer_h = 32
self.image_w = 40
self.image_h = 40
self.flatten_to_2d = True
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
class TestFusionTransposeFlattenConcationOp(OpTest):
def setUp(self):
self.init_test_case()
self.op_type = "fusion_transpose_flatten_concat"
ins = []
flats = []
for i in range(len(self.shapes)):
in_shape = self.shapes[i]
a = np.random.random(in_shape).astype("float32")
ins.append(("x%d" % i, a))
b = a.transpose(self.trans_axis)
flat_shape = (np.prod(b.shape[:self.flatten_axis]),
np.prod(b.shape[self.flatten_axis:]))
c = b.reshape(flat_shape)
flats.append(c)
out = np.concatenate(flats, axis=self.concat_axis)
self.inputs = {'X': ins}
self.attrs = {
'trans_axis': list(self.trans_axis),
'flatten_axis': self.flatten_axis,
'concat_axis': self.concat_axis
}
self.outputs = {'Out': out}
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.check_output_with_place(place, 1e-6)
else:
pass
def init_test_case(self):
self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
self.trans_axis = (0, 2, 3, 1)
self.flatten_axis = 1
self.concat_axis = 1
class TestCase1(TestFusionTransposeFlattenConcationOp):
def init_test_case(self):
self.shapes = [(3, 4, 18, 17), (3, 8, 18, 7), (6, 12, 9, 5)]
self.trans_axis = (0, 2, 3, 1)
self.flatten_axis = 2
self.concat_axis = 1
class TestCase2(TestFusionTransposeFlattenConcationOp):
def init_test_case(self):
self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
self.trans_axis = (0, 2, 3, 1)
self.flatten_axis = 2
self.concat_axis = 0
class TestCase3(TestFusionTransposeFlattenConcationOp):
def init_test_case(self):
self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
self.trans_axis = (0, 3, 2, 1)
self.flatten_axis = 1
self.concat_axis = 1
class TestCase4(TestFusionTransposeFlattenConcationOp):
def init_test_case(self):
self.shapes = [(3, 8, 9, 17), (8, 3, 9, 17), (4, 6, 9, 17)]
self.trans_axis = (0, 2, 1, 3)
self.flatten_axis = 3
self.concat_axis = 1
class TestCase5(TestFusionTransposeFlattenConcationOp):
def init_test_case(self):
self.shapes = [(3, 8, 9, 17, 2), (3, 8, 2, 17, 9), (3, 17, 9, 8, 2)]
self.trans_axis = (0, 2, 1, 4, 3)
self.flatten_axis = 1
self.concat_axis = 1
if __name__ == '__main__':
unittest.main()
......@@ -202,6 +202,17 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.sequence_unpad(x=x, length=length))
print(str(program))
def test_pool2d(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
self.assertIsNotNone(
layers.pool2d(
x,
pool_size=[5, 3],
pool_stride=[1, 2],
pool_padding=(2, 1)))
def test_lstm_unit(self):
program = Program()
with program_guard(program):
......
......@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
lod.append(nmsed_num)
if nmsed_num == 0: continue
tmp_det_out = []
for c, indices in nmsed_outs.items():
for idx in indices:
xmin, ymin, xmax, ymax = boxes[n][idx][:]
det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
tmp_det_out.append(
[c, scores[n][c][idx], xmin, ymin, xmax, ymax])
sorted_det_out = sorted(
tmp_det_out, key=lambda tup: tup[0], reverse=False)
det_outs.extend(sorted_det_out)
return det_outs, lod
......@@ -210,7 +215,7 @@ class TestMulticlassNMSOp(OpTest):
class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
def set_argument(self):
# Here set 2.0 to test the case there is no outputs.
# In practical use, 0.0 < score_threshold < 1.0
# In practical use, 0.0 < score_threshold < 1.0
self.score_threshold = 2.0
......
......@@ -644,6 +644,9 @@ in a single call.")
else:
recv_inputs.append(single_trainer_var)
self._slice_params_and_optimizes = self._get_slice_vars_and_attrs(
endpoint)
# step 3
# Create a union-find data structure from optimize ops,
# If two ops are connected, we could add these two ops
......@@ -766,7 +769,7 @@ in a single call.")
grad_to_block_id, merged_var,
lr_ops)
# dedup grad to ids list
# dedup grad to ids list
grad_to_block_id = list(set(grad_to_block_id))
# append global ops
if global_ops:
......@@ -827,8 +830,8 @@ in a single call.")
attrs=attrs)
# add distributed attrs
pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
endpoint)
pserver_program._slice_vars_and_attrs = list(
self._slice_params_and_optimizes.values())
pserver_program._sync_with_cpp()
# save pserver program to generate pserver side startup relatively.
......@@ -941,12 +944,12 @@ to transpile() call.")
outputs={"Out": startup_tmpvar})
# add slice vars
s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs
return s_prog
def _get_slice_vars_and_attrs(self, endpoint):
slice_vars_and_attrs = []
slice_vars_and_attrs = {}
block_suffix = "block"
for param in self.param_grad_ep_mapping[endpoint]["params"]:
orig_var_name, block_name, _ = self._get_varname_parts(param.name)
......@@ -960,8 +963,7 @@ to transpile() call.")
slice_vars = self.param_var_mapping[orig_var_name]
for slice_var in slice_vars[:block_idx]:
skip_dim0 += slice_var.shape[0]
slice_vars_and_attrs.append([orig_var, skip_dim0, param])
slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param]
return slice_vars_and_attrs
# ====================== private transpiler functions =====================
......@@ -1662,10 +1664,10 @@ to transpile() call.")
if key in ["Param", "Grad", "LearningRate"]:
continue
var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
param_var = new_inputs["Param"]
# update accumulator variable shape
param_shape = new_inputs["Param"].shape
new_shape = self._get_optimizer_input_shape(opt_op.type, key,
var.shape, param_shape)
new_shape = self._get_optimizer_input_shape(
opt_op.type, key, var.shape, param_var.shape)
tmpvar = pserver_block.create_var(
name=var.name,
persistable=var.persistable,
......@@ -1673,6 +1675,13 @@ to transpile() call.")
shape=new_shape)
new_inputs[key] = tmpvar
# var shape been changed
if new_shape != var.shape:
slice_var_args = self._slice_params_and_optimizes[
param_var.name]
self._slice_params_and_optimizes[
var.name] = [var, slice_var_args[1], tmpvar]
# change output's ParamOut variable
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, opt_op)
......
......@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
bash build_scripts/install_nccl2.sh && rm -rf build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
......
......@@ -50,6 +50,15 @@ function do_cpython_build {
mkdir -p ${prefix}/lib
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then
wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
tar -zxf sqlite-autoconf-3250300.tar.gz
cd sqlite-autoconf-3250300
./configure --prefix=/usr/local
make -j8 && make install
cd ../ && rm sqlite-autoconf-3250300.tar.gz
fi
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
......@@ -59,9 +68,9 @@ function do_cpython_build {
make -j8 > /dev/null
make altinstall > /dev/null
else
CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
make -j8 > /dev/null
make install > /dev/null
LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null
LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null
fi
popd
echo "ZZZ looking for libpython"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册