Commit 1800fef1 authored by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_pybind11

......@@ -97,6 +97,14 @@ if(WITH_GPU)
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
if(WITH_ANAKIN)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
......@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN)
return()
endif()
set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
"Anakin install path." FORCE)
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
INCLUDE(ExternalProject)
set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
# the anakin install dir is the only default one for now
set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
......@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
# A helper function used by Anakin; currently, to use it, one needs to recursively include
# nearly all the header files.
function(fetch_include_recursively root_dir)
if (IS_DIRECTORY ${root_dir})
include_directories(${root_dir})
endif()
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
foreach(sub ${ALL_SUB})
if (IS_DIRECTORY ${root_dir}/${sub})
fetch_include_recursively(${root_dir}/${sub})
endif()
endforeach()
endfunction()
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
endif()
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE})
link_directories(${ANAKIN_LIBRARY})
endif()
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin protobuf mklml)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
......@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
......
......@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
......@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
......
......@@ -34,21 +34,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -62,20 +51,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
......@@ -338,14 +323,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager {
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type()))
if (teller_set.count(func->func_type())) {
return true;
else {
} else {
return false;
}
};
......
......@@ -45,7 +45,6 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
......@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e
endif()
if (WITH_ANAKIN) # only needed in CI
# Because Anakin does not have official library releases and the versions of protobuf and CUDA do not match Paddle's,
# the anakin library will not be merged into our official inference library. To use the Anakin prediction API, one needs to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively(${ANAKIN_INCLUDE})
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared)
if (WITH_TESTING)
# this test is unstable, disable it first.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api dynload_cuda SERIAL)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()
......@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc
activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This layer doesn't have weights.
*/
class SoftMaxOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
<< "convert a fluid softmax op to tensorrt softmax layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax,
*const_cast<nvinfer1::ITensor*>(input1));
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(SoftMaxOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(8, parameters, scope, 1000);
std::vector<int> tensor_shape{8, 10};
validator.DeclInputVar("softmax-X", tensor_shape,
nvinfer1::DimsCHW(10, 1, 1));
validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
......@@ -79,6 +79,12 @@ class TRTConvertValidation {
}
// Declare a Variable as input with random initialization.
void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
const nvinfer1::Dims& trt_dims) {
DeclVar(name, tensor_dims);
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
}
void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
// Declare TRT inputs.
......@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar(name, dims);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
// Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but we should add it to
......@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
DeclVar(name, dim_vec);
}
void SetOp(const framework::proto::OpDesc& desc) {
......
......@@ -16,60 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
int post, int size) {
int idx_x = threadIdx.x + blockIdx.x * blockDim.x;
if (idx_x < size) {
int idx_y = idx_x / post - (idx_x / (n * post)) * n;
z[idx_x] = x[idx_x] + y[idx_y];
}
}
template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
using Tensor = framework::Tensor;
const auto x = ctx.Input<Tensor>("X");
const auto y = ctx.Input<Tensor>("Y");
auto z = ctx.Output<Tensor>("Out");
auto *z_data = z->mutable_data<T>(ctx.GetPlace());
auto &device = *(ctx.cuda_device_context().eigen_device());
const framework::DDim &x_dim = x->dims();
framework::DDim y_dim = y->dims();
int size = x->numel();
if (x_dim == y_dim) {
auto dim = framework::make_ddim({size});
auto z_eigen = framework::EigenTensor<T, 1>::From(*z, dim);
auto x_eigen = framework::EigenTensor<T, 1>::From(*x, dim);
auto y_eigen = framework::EigenTensor<T, 1>::From(*y, dim);
z_eigen.device(device) = x_eigen + y_eigen;
} else {
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
int threads = 512;
int grids = (size + threads - 1) / threads;
auto stream = ctx.cuda_device_context().stream();
ElementwiseAddCUDAKernel<T><<<grids, threads, 0, stream>>>(
x->data<T>(), y->data<T>(), z_data, n, post, size);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
......
......@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
// skip out, x, y
auto* out = dout;
auto *x = dout, *y = dout;
if (dx != nullptr) {
// In fact, we can just share memory, but it may cause a bug in the memory
// optimizer
// dx->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dx);
}
if (dy == nullptr) return;
const framework::DDim& x_dim = dout->dims();
framework::DDim y_dim = dy->dims();
if (x_dim == y_dim) {
// dy->ShareDataWith(*dout);
framework::TensorCopy(*dout, ctx.GetPlace(),
ctx.template device_context<DeviceContext>(), dy);
if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
dy != nullptr && (dx->dims() == dy->dims())) {
elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else {
dy->mutable_data<T>(ctx.GetPlace());
// Perform reduction to dout to calculate dy
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
y_dim = trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
auto& device =
*(ctx.template device_context<DeviceContext>().eigen_device());
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
auto eigen_dout = framework::EigenTensor<T, 3>::From(
*dout, framework::make_ddim({pre, n, post}));
auto eigen_dy =
framework::EigenTensor<T, 1>::From(*dy, framework::make_ddim({n}));
eigen_dy.device(device) = eigen_dout.sum(
framework::EigenDim<2>::From(framework::make_ddim({0, 2})));
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
dy);
}
}
};
......
......@@ -666,7 +666,7 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>())
.def("bcast_params", &ParallelExecutor::BCastParamsToDevices)
.def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even though we return a vec<Scope*>* to Python using the reference policy,
// we still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
......@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from .concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import profiler
......@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
__all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [
'io',
......
......@@ -19,8 +19,7 @@ from .layers import fill_constant
from . import core
__all__ = [
'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
'Select'
'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
]
......@@ -35,10 +34,10 @@ class Go(BlockGuard):
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
self.construct_go_op()
self._construct_go_op()
return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
def construct_go_op(self):
def _construct_go_op(self):
main_program = self.helper.main_program
go_block = main_program.current_block()
parent_block = main_program.block(main_program.current_block()
......
......@@ -18,9 +18,7 @@ import six
from .framework import Program, default_main_program, Variable
from . import core
__all__ = [
'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
]
__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope']
g_scope = core.Scope()
......@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return fetch_count > 0
def fetch_var(name, scope=None, return_numpy=True):
def _fetch_var(name, scope=None, return_numpy=True):
"""
Fetch the value of the variable with the given name from the
given scope.
......@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list):
return str(feed_var_names + fetch_var_names)
def _as_lodtensor(data, place):
"""
Convert numpy.ndarray to Tensor; it only supports Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
>>> exe = fluid.Executor(place)
>>> data = np.random.random(size=(100, 200, 300))
>>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
>>> ...
Args:
data(numpy.ndarray): an instance of numpy.ndarray
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, place)
return tensor
class Executor(object):
"""
An Executor in Python, which only supports single-GPU execution. For multi-card execution, please refer to
......@@ -250,35 +279,6 @@ class Executor(object):
self.program_caches = dict()
self._closed = False
def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor; it only supports Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): an instance of numpy.ndarray
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
ndarray to LoDTensor. Please convert data to LoDTensor \
directly before feeding the data.\
")
# single tensor case
tensor = core.LoDTensor()
tensor.set(data, self.place)
return tensor
def _get_program_cache(self, program_cache_key):
return self.program_caches.get(program_cache_key, None)
......@@ -338,7 +338,7 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.as_lodtensor(cur_feed)
cur_feed = _as_lodtensor(cur_feed, self.place)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
......
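A minimal usage sketch (not part of the diff; the helper name and the explicit place argument follow the module-level function added in this hunk, and the updated PyReader test later in this commit makes the same call): code that previously went through the public Executor.as_lodtensor method now uses the private module-level helper.
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
data = np.random.rand(8, 16).astype('float32')
# before this change: tensor = exe.as_lodtensor(data)
tensor = fluid.executor._as_lodtensor(data, place)  # plain Tensor, no LoD information attached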
......@@ -279,19 +279,19 @@ class ParallelExecutor(object):
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist:
self.bcast_params()
self._bcast_params()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))]
def bcast_params(self):
def _bcast_params(self):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self.executor.bcast_params(set(self.persistable_vars))
self.executor._bcast_params(set(self.persistable_vars))
@property
def device_count(self):
......
......@@ -537,5 +537,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestRMSPropOptimizer(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
optimizer.minimize(avg_cost)
return
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
pserver2, startup2 = self.get_pserver(self.pserver2_ep)
self.assertEqual(len(pserver.blocks), 3)
# block1~2: optimize pass
self.assertEqual([op.type for op in pserver.blocks[1].ops],
["sum", "scale", "rmsprop"])
# the variable #fc_w will be split into two blocks
fc_w_var = startup.global_block().var("fc_w.block1")
self.assertEqual(fc_w_var.shape, (500, 1000))
moment_var = startup.global_block().var("momentum_1")
self.assertEqual(moment_var.shape, (500, 1000))
if __name__ == "__main__":
unittest.main()
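A short arithmetic sketch restating the shape bookkeeping this test asserts (no new values assumed): with two pservers, fc_w of shape (1000, 1000) is split row-wise into two (500, 1000) blocks, so the RMSProp accumulators created on each pserver take the split shape rather than the full parameter shape.
# shape bookkeeping mirrored from TestRMSPropOptimizer above
param_shape = (1000, 1000)                      # fc_w on the trainer side
num_pservers = 2
rows_per_block = param_shape[0] // num_pservers
pserver_block_shape = (rows_per_block, param_shape[1])
assert pserver_block_shape == (500, 1000)       # matches fc_w.block1 and momentum_1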
......@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest):
layers.assign(input=val, output=x)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
fetched_x = fluid.fetch_var("x")
fetched_x = fluid.executor._fetch_var("x")
self.assertTrue(
numpy.array_equal(fetched_x, val),
"fetch_x=%s val=%s" % (fetched_x, val))
......
......@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase):
next_data = np.random.uniform(
low=0, high=1000,
size=(batch_size, ) + shape[1:]).astype(dtype)
in_data.append(executor.as_lodtensor(next_data))
in_data.append(
fluid.executor._as_lodtensor(next_data, place))
self.inputs.append(in_data)
......
......@@ -22,7 +22,7 @@ pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
batch_size = 64
batch_size = 2
def position_encoding_init(n_position, d_pos_vec):
......
......@@ -751,14 +751,14 @@ class DistributeTranspiler(object):
out_name = op.output("Out")
ids_var = program.global_block().vars[ids_name[0]]
prefetch_input_vars = self.create_splited_vars(
prefetch_input_vars = self._create_splited_vars(
source_var=ids_var,
block=program.global_block(),
tag="_prefetch_in_")
self.all_prefetch_input_vars.append(prefetch_input_vars)
out_var = program.global_block().vars[out_name[0]]
prefetch_output_vars = self.create_splited_vars(
prefetch_output_vars = self._create_splited_vars(
source_var=out_var,
block=program.global_block(),
tag="_prefetch_out_")
......@@ -1040,7 +1040,7 @@ class DistributeTranspiler(object):
program.global_block()._sync_with_cpp()
return var_mapping
def create_splited_vars(self, source_var, block, tag):
def _create_splited_vars(self, source_var, block, tag):
return [
block.create_var(
name=str(source_var.name + tag + str(index)),
......@@ -1184,18 +1184,39 @@ class DistributeTranspiler(object):
program = optimize_block.program
pserver_block = program.global_block()
new_inputs = collections.OrderedDict()
# update param/grad shape first, then other inputs like
# moment can use the updated shape
def _get_param_block(opt_op):
# param is already created on global program
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input("Param")[0]):
param_block = p
break
return param_block
for key in opt_op.input_names:
if key == "Grad":
new_inputs[key] = merged_var
# For RMSProp optimizer
elif key == "Moment" or key == "MeanSquare":
param_block = _get_param_block(opt_op)
if not param_block:
return
moment_var = origin_program.global_block().vars[opt_op.input(
key)[0]]
tmpvar = pserver_block.create_var(
name=moment_var.name,
persistable=moment_var.persistable,
dtype=moment_var.dtype,
# change to use same shape as param
# TODO(typhoonzero): didn't append .block in the var name,
# may affect checkpoint saving? Need to verify.
shape=param_block.shape)
new_inputs[key] = tmpvar
elif key == "Param":
# param is already created on global program
param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]:
if same_or_split_var(p.name, opt_op.input(key)[0]):
param_block = p
break
param_block = _get_param_block(opt_op)
if not param_block:
return
tmpvar = pserver_block.create_var(
......@@ -1221,7 +1242,7 @@ class DistributeTranspiler(object):
for key in opt_op.input_names:
new_shape = None
if key in ["Param", "Grad", "LearningRate"]:
if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
continue
var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape
......
......@@ -57,10 +57,10 @@ class InferenceTranspiler(object):
scope = global_scope()
if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope)
self.fuse_relu_mkldnn(program)
self._fuse_batch_norm(program, place, scope)
self._fuse_relu_mkldnn(program)
def fuse_relu_mkldnn(self, program):
def _fuse_relu_mkldnn(self, program):
'''
Transpile the program by fusing the relu activation for an MKLDNN program.
......@@ -104,7 +104,7 @@ class InferenceTranspiler(object):
# And a better solution will be considered later.
program = program.clone()
def fuse_batch_norm(self, program, place, scope):
def _fuse_batch_norm(self, program, place, scope):
'''
Transpile the program by fusing batch normalization.
......
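A minimal sketch of the caller-facing effect of this rename (assumptions: a CPU place and a trivial cloned program, neither of which appears in the diff): the fusion passes are no longer public, so user code is expected to call transpile(), which runs _fuse_batch_norm and _fuse_relu_mkldnn internally.
import paddle.fluid as fluid

place = fluid.CPUPlace()
program = fluid.default_main_program().clone()
t = fluid.InferenceTranspiler()
# previously callable directly: t.fuse_batch_norm(program, place, scope), t.fuse_relu_mkldnn(program)
t.transpile(program, place)  # both fusion passes now run inside transpile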
requests==2.9.2
numpy>=1.12
numpy>=1.12,<=1.14  # TODO: change to ">=1.12" once numpy fixes the bug in 1.15 and later versions
protobuf==3.1
recordio>=0.1.0
matplotlib
......