提交 cf4dbaa9 编写于 作者: L lujun

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into hot-fix-utest

......@@ -75,7 +75,6 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface"
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
option(WITH_WBAES "Compile PaddlePaddle with WBAES support" ON)
# PY_VERSION
if(NOT PY_VERSION)
......@@ -149,7 +148,6 @@ include(external/dlpack)
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
include(external/wbaes) # download wbaes
if (NOT WIN32)
# there is no official support of nccl, cupti in windows
......
......@@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
if(WITH_WBAES)
add_definitions(-DPADDLE_WITH_WBAES)
endif(WITH_WBAES)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Downloads the prebuilt WBAES SDK archive and exposes it as the imported
# shared-library target `wbaes`. Does nothing unless WITH_WBAES is enabled.
#
# Reads:   WITH_WBAES, THIRD_PARTY_PATH, EXTERNAL_PROJECT_LOG_ARGS
# Defines: WBAES_ROOT, WBAES_INC_DIR, WBAES_LIB_DIR, WBAES_LIB,
#          WBAES_SHARED_LIB, WBAES_TAG / WBAES_URL (cache), target `wbaes`,
#          and the ExternalProject target `extern_wbaes`.

# Test the variable by name: `${WITH_WBAES}` expands to nothing when the
# option is undefined, which would leave a malformed `if(NOT )`.
if(NOT WITH_WBAES)
  return()
endif()

include(ExternalProject)

set(WBAES_DST_DIR "wbaes")
set(WBAES_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
set(WBAES_INSTALL_DIR "${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR}")
set(WBAES_ROOT "${WBAES_INSTALL_DIR}")
set(WBAES_INC_DIR "${WBAES_ROOT}/include")
set(WBAES_LIB_DIR "${WBAES_ROOT}/lib")

# list(APPEND) avoids the empty leading list element that
# set(VAR "${VAR}" value) produces when CMAKE_INSTALL_RPATH is still empty.
list(APPEND CMAKE_INSTALL_RPATH "${WBAES_ROOT}/lib")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

# Per-platform archive name, release tag and library file names.
if(APPLE)
  set(wbaes_platform "mac")
  set(wbaes_tag "v1.0.0")
  set(wbaes_link_name "libwbaes.dylib")
  set(wbaes_runtime_name "libwbaes.dylib")
elseif(WIN32)
  set(wbaes_platform "windows-x64")
  set(wbaes_tag "v1.0.0")
  set(wbaes_link_name "libwbaes.lib")
  set(wbaes_runtime_name "libwbaes.dll")
else()
  set(wbaes_platform "linux-x86_64")
  set(wbaes_tag "v1.0.2")
  set(wbaes_link_name "libwbaes.so")
  set(wbaes_runtime_name "libwbaes.so")
endif()

# FORCE is kept on purpose so that a tag bump in this file always wins over a
# stale cache entry (same behaviour as before).
set(WBAES_TAG "${wbaes_tag}" CACHE STRING "" FORCE)
# NOTE(review): the archive is fetched over plain HTTP without URL_HASH, so
# its integrity is not verified - consider adding a hash once one is published.
set(WBAES_URL
    "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.${wbaes_platform}.${WBAES_TAG}.tgz"
    CACHE STRING "" FORCE)
set(WBAES_LIB "${WBAES_LIB_DIR}/${wbaes_link_name}")
set(WBAES_SHARED_LIB "${WBAES_LIB_DIR}/${wbaes_runtime_name}")

set(WBAES_PROJECT "extern_wbaes")
message(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
set(WBAES_SOURCE_DIR "${THIRD_PARTY_PATH}/wbaes")
set(WBAES_DOWNLOAD_DIR "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")

# Nothing is configured or built: the archive already contains binaries.
# The install step copies include/ and lib/ into the install tree. The two
# copies are chained with COMMAND (the documented ExternalProject mechanism)
# instead of a shell-specific `&&`.
ExternalProject_Add(
  ${WBAES_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  PREFIX "${WBAES_SOURCE_DIR}"
  URL "${WBAES_URL}"
  DOWNLOAD_DIR "${WBAES_DOWNLOAD_DIR}"
  DOWNLOAD_NO_PROGRESS 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND
    ${CMAKE_COMMAND} -E copy_directory
    "${WBAES_DOWNLOAD_DIR}/include" "${WBAES_INC_DIR}"
  COMMAND
    ${CMAKE_COMMAND} -E copy_directory
    "${WBAES_DOWNLOAD_DIR}/lib" "${WBAES_LIB_DIR}"
)

# Directory-scoped on purpose: the include directory does not exist until the
# external project has been installed, so it cannot be attached to the
# imported target as INTERFACE_INCLUDE_DIRECTORIES at configure time.
include_directories("${WBAES_INC_DIR}")

# NOTE(review): on Windows IMPORTED_LOCATION points at the import library
# (libwbaes.lib) rather than the DLL; IMPORTED_IMPLIB may be the intended
# property there - confirm before linking against this target on Windows.
add_library(wbaes SHARED IMPORTED GLOBAL)
set_property(TARGET wbaes PROPERTY IMPORTED_LOCATION "${WBAES_LIB}")
set_property(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1)
# Make sure the SDK is downloaded and copied before anything uses `wbaes`.
add_dependencies(wbaes ${WBAES_PROJECT})
......@@ -264,14 +264,6 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Only deps libwbaes.so, not link
if("${cc_library_DEPS};" MATCHES "wbaes;")
list(REMOVE_ITEM cc_library_DEPS wbaes)
if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes")
list(APPEND cc_library_DEPS dynload_wbaes)
endif()
add_dependencies(${TARGET_NAME} wbaes)
endif()
# Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
......
......@@ -170,14 +170,6 @@ copy(snappystream_lib
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
if (WITH_WBAES)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes")
copy(wbaes_lib
SRCS ${WBAES_INC_DIR} ${WBAES_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS wbaes)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
......
......@@ -235,6 +235,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
......
......@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
// Listens on the port of endpoint `ep` ("host:port"), accepts one connection
// and reads a ncclUniqueId from it into *nccl_id.
//
// Only the port part of `ep` is used; the socket binds to INADDR_ANY.
// Blocks until a peer connects and NCCL_UNIQUE_ID_BYTES bytes have arrived.
// Throws (PADDLE_THROW / PADDLE_ENFORCE_EQ) on a malformed endpoint, on any
// socket error, or when the peer closes the connection before the whole id
// has been received. Every opened descriptor is closed before returning or
// throwing.
void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  int port = std::stoi(addr[1]);

  int server_fd, new_socket;
  struct sockaddr_in address = {};  // zero-initialised, incl. sin_zero
  socklen_t addrlen = sizeof(address);
  char buffer[1024] = {0};
  int opt = 1;  // non-zero enables the option

  // creating socket fd; socket() signals failure with -1 (0 is a valid fd)
  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    PADDLE_THROW("create server fd failed");

  // Option names are not bit flags and must not be OR-ed together. Enable
  // SO_REUSEADDR so the port can be re-bound right after a restart.
  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) {
    close(server_fd);
    PADDLE_THROW("set socket opt failed");
  }

  address.sin_family = AF_INET;
  address.sin_addr.s_addr = INADDR_ANY;
  address.sin_port = htons(port);

  if (bind(server_fd, reinterpret_cast<struct sockaddr *>(&address),
           sizeof(address)) < 0) {
    close(server_fd);
    PADDLE_THROW("binding failed on ep: %s", ep);
  }
  VLOG(3) << "listening on: " << ep;
  if (listen(server_fd, 3) < 0) {
    close(server_fd);
    PADDLE_THROW("listen on server fd failed");
  }

  if ((new_socket =
           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
                  &addrlen)) < 0) {
    close(server_fd);
    PADDLE_THROW("accept the new socket fd failed");
  }

  // A stream socket may deliver the id in several pieces: keep reading until
  // all NCCL_UNIQUE_ID_BYTES bytes have arrived.
  const size_t expected = static_cast<size_t>(NCCL_UNIQUE_ID_BYTES);
  size_t received = 0;
  while (received < expected) {
    auto n = read(new_socket, buffer + received, expected - received);
    if (n <= 0) {  // error, or peer closed before sending the whole id
      close(new_socket);
      close(server_fd);
      PADDLE_THROW("reading the ncclUniqueId from socket failed");
    }
    received += static_cast<size_t>(n);
  }

  VLOG(3) << "recevived the ncclUniqueId";
  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);

  VLOG(3) << "closing the socket server: " << ep;
  close(new_socket);
  close(server_fd);
}
// Connects to endpoint `ep` ("host:port", host as a dotted IPv4 address) and
// sends *nccl_id to it.
//
// If the peer is not accepting connections yet, the connection attempt is
// repeated every 3 seconds, indefinitely. Throws (PADDLE_THROW /
// PADDLE_ENFORCE_EQ) on a malformed endpoint, an invalid address, a socket
// creation failure or a send failure. The socket is always closed before
// returning or throwing.
void NCCLParallelContext::SendNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  std::string host = addr[0];
  int port = std::stoi(addr[1]);

  struct sockaddr_in serv_addr;
  // Zero the structure with the value 0 (not the character '0').
  memset(&serv_addr, 0, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
  serv_addr.sin_port = htons(port);

  // Validate the address before any descriptor is opened so that nothing can
  // leak on this error path.
  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
    PADDLE_THROW("invalied address: %s", ep);

  int sock = -1;
  while (true) {
    // The state of a socket after a failed connect() is unspecified, so each
    // attempt uses a fresh socket.
    if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
      PADDLE_THROW("create socket failed");
    if (connect(sock, reinterpret_cast<struct sockaddr *>(&serv_addr),
                sizeof(serv_addr)) == 0) {
      break;
    }
    close(sock);
    VLOG(0) << "worker: " << ep
            << " is not ready, will retry after 3 seconds...";
    std::this_thread::sleep_for(std::chrono::seconds(3));
  }

  VLOG(3) << "sending the ncclUniqueId to " << ep;
  // send() may transmit fewer bytes than requested: loop until the whole id
  // has been written.
  const char *payload = reinterpret_cast<const char *>(nccl_id);
  const size_t total = static_cast<size_t>(NCCL_UNIQUE_ID_BYTES);
  size_t sent = 0;
  while (sent < total) {
    auto n = send(sock, payload + sent, total - sent, 0);
    if (n < 0) {
      close(sock);
      PADDLE_THROW("sending the ncclUniqueId to %s failed", ep);
    }
    sent += static_cast<size_t>(n);
  }
  close(sock);
}
// Distributes *nccl_id from the rank `root` to all other trainers.
//
// On the root rank the id is sent to every endpoint in
// strategy_.trainer_endpoints_ except the current one; on every other rank
// the id is received on strategy_.current_endpoint_ and stored in *nccl_id.
void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
  const bool is_root = (strategy_.local_rank_ == root);
  if (!is_root) {
    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
    return;
  }

  for (const auto &endpoint : strategy_.trainer_endpoints_) {
    if (endpoint == strategy_.current_endpoint_) continue;  // skip ourselves
    SendNCCLID(endpoint, nccl_id);
  }
}
// Creates the NCCL communicator for this trainer and stores it in the CUDA
// device context of place_.
//
// Rank 0 generates the ncclUniqueId; BcastNCCLId then sends it from rank 0
// to the other ranks (or receives it when this is not rank 0). Afterwards
// the communicator is initialised on the GPU given by place_.
void NCCLParallelContext::Init() {
  ncclUniqueId nccl_id;
  ncclComm_t comm;

  if (strategy_.local_rank_ == 0) {
    // generate the unique ncclid on the root worker; fail loudly instead of
    // broadcasting an uninitialised id when NCCL reports an error
    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&nccl_id));
  }
  // Root sends, every other rank receives.
  BcastNCCLId(&nccl_id, 0);

  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;

  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
  dev_ctx->set_nccl_comm(comm);
}
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// network header files
#ifndef _WIN32
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace imperative {
// Describes how the current trainer takes part in a multi-trainer job.
struct ParallelStrategy {
  // Total number of ranks.
  int nranks_ = 1;
  // Rank of the current trainer.
  int local_rank_ = 0;
  // Endpoints ("host:port") of the trainers; empty by default.
  std::vector<std::string> trainer_endpoints_;
  // Endpoint of the current trainer; empty by default.
  std::string current_endpoint_;
};
// Abstract base of the parallel contexts: stores the strategy and the place,
// and leaves the actual initialisation to subclasses through Init().
class ParallelContext {
 public:
  explicit ParallelContext(const ParallelStrategy& strategy,
                           const platform::Place& place)
      : strategy_(strategy), place_(place) {}

  virtual ~ParallelContext() = default;

  // Implemented by subclasses.
  virtual void Init() = 0;

 protected:
  ParallelStrategy strategy_;  // copy of the strategy given at construction
  platform::Place place_;      // copy of the place given at construction
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class NCCLParallelContext : ParallelContext {
public:
explicit NCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
void BcastNCCLId(ncclUniqueId* nccl_id, int root);
void Init() override;
protected:
void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
};
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
// Builds the strategy of a two-trainer job on localhost for the trainer with
// rank `local_rank` (expected to be 0 or 1, as it indexes the endpoint list).
imperative::ParallelStrategy GetStrategy(int local_rank) {
  const std::vector<std::string> endpoints{"127.0.0.1:9866", "127.0.0.1:9867"};

  imperative::ParallelStrategy result;
  result.nranks_ = 2;
  result.local_rank_ = local_rank;
  result.trainer_endpoints_ = endpoints;
  result.current_endpoint_ = endpoints[local_rank];
  return result;
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
auto strategy = GetStrategy(local_rank);
platform::CUDAPlace gpu(local_rank);
imperative::NCCLParallelContext ctx(strategy, gpu);
ctx.BcastNCCLId(nccl_id, 0);
}
// Rank 0 broadcasts a freshly generated id from a background thread while
// rank 1 receives it on the calling thread; both ids must be byte-identical.
TEST(BcastNCCLId, Run) {
  ncclUniqueId root_id;
  platform::dynload::ncclGetUniqueId(&root_id);

  // Sender side (rank 0, the root).
  std::thread sender(BcastNCCLId, 0, &root_id);

  // Receiver side (rank 1).
  ncclUniqueId peer_id;
  BcastNCCLId(1, &peer_id);

  sender.join();

  const int diff =
      std::memcmp(root_id.internal, peer_id.internal, NCCL_UNIQUE_ID_BYTES);
  EXPECT_EQ(0, diff);
}
#endif
......@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
current_vars_map[out->Name()] = out;
}
VLOG(3) << "input var name: " << out->Name()
VLOG(3) << "output var name: " << out->Name()
<< " inited: " << out->var_->IsInitialized()
<< " stop_grad: " << out->IsStopGradient();
}
......@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::Scope scope;
op->place_ = GetExpectedPlace(expected_place, inputs);
PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
prepared_op.func(
......
......@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"runtime_context_cache_pass", //
// "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
......@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif //
"transpose_flatten_concat_fuse_pass",
});
......@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
// TODO(luotao): runtime_context_cache_pass should be located in the
// front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
// will enhance this pass later.
"runtime_context_cache_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
......@@ -132,7 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"runtime_context_cache_pass", //
});
use_gpu_ = false;
......
......@@ -23,18 +23,11 @@ namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false);
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
cfg->pass_builder()->SetPasses(
{"infer_clean_graph_pass", "mkldnn_placement_pass",
"depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
"conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
"conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
"fc_fuse_pass", "is_test_pass"});
}
template <typename T>
......@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
std::to_string(num_images) + " is bigger than all test data size.");
PaddleTensor images;
images.name = "input";
images.name = "image";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.name = "label";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
......@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "input");
image_batch_shape, "image");
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
......
# INT8 MKL-DNN quantization
This document describes how to use Paddle inference Engine to convert the FP32 model to INT8 model on ResNet-50 and MobileNet-V1. We provide the instructions on enabling INT8 MKL-DNN quantization in Paddle inference and show the ResNet-50 and MobileNet-V1 results in accuracy and performance.
## 0. Install PaddlePaddle
Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments.
```
cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
```
Note: MKL-DNN and MKL are required.
## 1. Enable INT8 MKL-DNN quantization
For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
* ### Create Analysis config
INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease)
* ### Create quantize config by analysis config
We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size.
```cpp
//Enable MKL-DNN quantization
cfg.EnableMkldnnQuantizer();
//use analysis config to call the MKL-DNN quantization config
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
```
## 2. Accuracy and Performance benchmark
We provide the accuracy and performance results measured on an Intel(R) Xeon(R) Gold 6271 using a single core.
>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
Notes:
* Measurement of accuracy requires a model which accepts two inputs: data and labels.
* Different sampling batch sizes may cause slight differences in INT8 top-1 accuracy.
* C-API performance is better than Python API performance because of the Python overhead. The overhead is especially noticeable for computationally small models.
## 3. Commands to reproduce the above accuracy and performance benchmark
* #### Full dataset (Single core)
* ##### Download full ImageNet Validation Dataset
```bash
cd /PATH/TO/PADDLE/build
python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
```
The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin
* ##### ResNet50 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
* ##### Mobilenet-v1 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
......@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor,
int num_threads, int tid) {
int num_times = FLAGS_repeat;
int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
if (FLAGS_iterations > 0 &&
FLAGS_iterations < static_cast<int64_t>(inputs.size()))
iterations =
FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations);
......@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif
if (!FLAGS_zero_copy) {
run_timer.tic();
for (size_t i = 0; i < iterations; i++) {
for (int i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
}
}
elapsed_time = run_timer.toc();
} else {
for (size_t i = 0; i < iterations; i++) {
for (int i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic();
for (int j = 0; j < num_times; j++) {
......@@ -366,9 +367,8 @@ void TestOneThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
PredictionRun(predictor.get(), inputs, outputs, 1, 0);
}
void TestMultiThreadPrediction(
......
......@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
......@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp {
framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block");
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
Outputs(framework::GradVarName("Input")));
const auto &ins = Inputs("Input");
const auto &d_ins = Outputs(framework::GradVarName("Input"));
const auto &conds = Inputs("Cond");
const auto &d_conds = Outputs(framework::GradVarName("Cond"));
std::vector<std::string> ins_conds_grads;
ins_conds_grads.reserve(ins.size() + conds.size());
for (auto &in : ins) {
ins_conds_grads.emplace_back(framework::GradVarName(in));
}
for (auto &cond : conds) {
ins_conds_grads.emplace_back(framework::GradVarName(cond));
}
exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
ins_conds_grads);
AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
ins.size(), d_ins);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
Outputs(framework::GradVarName("Cond")));
AssignLocalGradientToGlobal(dev_place, cur_scope,
ins_conds_grads.data() + ins.size(),
conds.size(), d_conds);
}
}
private:
void AssignLocalGradientToGlobal(
const platform::Place &place, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names,
const std::string *p_grad_names, size_t p_grad_names_num,
const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) {
for (size_t i = 0; i < p_grad_names_num; ++i) {
auto out_grad_name = pg_names[i];
auto in_grad_name = framework::GradVarName(p_names[i]);
const auto &in_grad_name = p_grad_names[i];
auto *in_var = cur_scope.FindVar(in_grad_name);
if (in_var == nullptr) {
continue;
......
......@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__)
EXPECT_EQ(kers.size(), 10UL);
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
size_t target_num = 8;
#ifdef __AVX__
target_num += 2;
#endif
#ifdef PADDLE_WITH_MKLML
target_num += 12;
#endif
EXPECT_EQ(kers.size(), target_num);
}
TEST(JITKernel_pool, refer) {
......
......@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, platform::CPUPlace());
framework::proto::VarType::FP32, ctx.GetPlace());
return kt;
}
};
......
/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
#include <memory>
namespace paddle {
namespace operators {
// Forward operator of pixel_shuffle; only shape inference is defined here.
class PixelShuffleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that X and Out exist, that X is 4-D and that its dimension 1 is
  // divisible by upscale_factor^2, then sets the shape of Out to
  // [d0, d1 / r^2, d2 * r, d3 * r] where d is the shape of X and r is
  // upscale_factor.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of PixelShuffleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PixelShuffleOp should not be null.");

    const auto x_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(x_dims.size() == 4, "The layout of input is NCHW.");

    const auto r = ctx->Attrs().Get<int>("upscale_factor");
    const auto r_squared = r * r;
    PADDLE_ENFORCE(x_dims[1] % r_squared == 0,
                   "Upscale_factor should devide the number of channel");

    // Start from a copy of the input shape: dimension 0 is kept as is.
    auto out_dims = x_dims;
    out_dims[1] = x_dims[1] / r_squared;
    out_dims[2] = x_dims[2] * r;
    out_dims[3] = x_dims[3] * r;
    ctx->SetOutputDim("Out", out_dims);
  }
};
// Declares the proto of the pixel_shuffle operator: one input "X", one output
// "Out", the integer attribute "upscale_factor" and the operator comment.
class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// Input tensor; described as NCHW in the help text.
AddInput(
"X",
"(Tensor, default Tensor<float>), "
"the input feature data of PixelShuffleOp, the layout is [N C H W].");
// Output tensor; its layout relative to X is given in the help text.
AddOutput(
"Out",
"(Tensor, default Tensor<float>), the output of "
"PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
// upscale_factor defaults to 1; the custom checker rejects values below 1.
AddAttr<int>("upscale_factor",
"the factor to increase spatial resolution by.")
.SetDefault(1)
.AddCustomChecker([](const int& upscale_factor) {
PADDLE_ENFORCE_GE(upscale_factor, 1,
"upscale_factor should be larger than 0.");
});
// Operator-level documentation string.
AddComment(R"DOC(
Pixel Shuffle operator
This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
to a tensor of shape :math:`(C, H \times r, W \times r)`.
This is useful for implementing efficient sub-pixel convolution
with a stride of :math:`1/r`.
Please refer to the paper:
`Real-Time Single Image and Video Super-Resolution Using an Efficient
Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et. al (2016) for more details.
)DOC");
}
};
class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("pixel_shuffle_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
// Backward operator of pixel_shuffle; only shape inference is defined here.
class PixelShuffleGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that Out@GRAD and X@GRAD exist and that Out@GRAD is 4-D, then
  // sets the shape of X@GRAD to [d0, d1 * r^2, d2 / r, d3 / r] where d is
  // the shape of Out@GRAD and r is upscale_factor.
  void InferShape(framework::InferShapeContext* ctx) const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
                   "Input(Out@Grad) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name),
                   "Output(X@Grad) should not be null");

    const auto dout_dims = ctx->GetInputDim(out_grad_name);
    PADDLE_ENFORCE(dout_dims.size() == 4, "The layout of input is NCHW.");

    const auto r = ctx->Attrs().Get<int>("upscale_factor");

    // Start from a copy of the gradient shape: dimension 0 is kept as is.
    auto x_grad_dims = dout_dims;
    x_grad_dims[1] = dout_dims[1] * (r * r);
    x_grad_dims[2] = dout_dims[2] / r;
    x_grad_dims[3] = dout_dims[3] / r;
    ctx->SetOutputDim(x_grad_name, x_grad_dims);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the forward operator together with its proto maker and the maker
// of its gradient operator description.
REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
ops::PixelShuffleGradMaker);
// Register the backward operator.
REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
// CPU kernels of the forward operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
pixel_shuffle,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels of the backward operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -12,23 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_WBAES
#include "paddle/fluid/platform/dynload/wbaes.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag wbaes_dso_flag;
void *wbaes_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WBAES_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
#include "paddle/fluid/operators/pixel_shuffle_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// CUDA kernels of the forward op for float and double.
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
// CUDA kernels of the gradient op for float and double.
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Forward kernel of pixel_shuffle.
//
// The input buffer is viewed as a 6-D tensor
//   [x_dims[0], y_dims[1], r, r, x_dims[2], x_dims[3]]
// and transposed with the permutation {0, 1, 4, 2, 5, 3} into a 6-D view of
// the output buffer
//   [x_dims[0], y_dims[1], x_dims[2], r, x_dims[3], r],
// where r is the "upscale_factor" attribute.
template <typename DeviceContext, typename T>
class PixelShuffleOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* x = ctx.Input<framework::Tensor>("X");
    auto* y = ctx.Output<framework::Tensor>("Out");
    y->mutable_data<T>(ctx.GetPlace());

    const int r = ctx.Attr<int>("upscale_factor");
    const auto x_dims = x->dims();
    const auto y_dims = y->dims();

    // 6-D alias of the input; shares the input's storage.
    framework::Tensor src;
    src.ShareDataWith(*x);
    src.Resize({x_dims[0], y_dims[1], r, r, x_dims[2], x_dims[3]});

    // 6-D alias of the output; the transpose writes through it.
    framework::Tensor dst;
    dst.ShareDataWith(*y);
    dst.Resize({x_dims[0], y_dims[1], x_dims[2], r, x_dims[3], r});

    const std::vector<int> perm = {0, 1, 4, 2, 5, 3};
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::Transpose<DeviceContext, T, 6> transpose;
    transpose(dev_ctx, src, &dst, perm);

    y->Resize(y_dims);
  }
};
// Backward kernel of pixel_shuffle.
//
// The Out@GRAD buffer is viewed as a 6-D tensor
//   [g_dims[0], g_dims[1], dx_dims[2], r, dx_dims[3], r]
// and transposed with the permutation {0, 1, 3, 5, 2, 4} into a 6-D view of
// the X@GRAD buffer
//   [g_dims[0], g_dims[1], r, r, dx_dims[2], dx_dims[3]],
// where r is the "upscale_factor" attribute. This permutation is the inverse
// of {0, 1, 4, 2, 5, 3}.
template <typename DeviceContext, typename T>
class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* out_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(ctx.GetPlace());

    const int r = ctx.Attr<int>("upscale_factor");
    const auto g_dims = out_grad->dims();
    const auto dx_dims = x_grad->dims();

    // 6-D alias of Out@GRAD; shares its storage.
    framework::Tensor src;
    src.ShareDataWith(*out_grad);
    src.Resize({g_dims[0], g_dims[1], dx_dims[2], r, dx_dims[3], r});

    // 6-D alias of X@GRAD; the transpose writes through it.
    framework::Tensor dst;
    dst.ShareDataWith(*x_grad);
    dst.Resize({g_dims[0], g_dims[1], r, r, dx_dims[2], dx_dims[3]});

    const std::vector<int> perm = {0, 1, 3, 5, 2, 4};
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::Transpose<DeviceContext, T, 6> transpose;
    transpose(dev_ctx, src, &dst, perm);

    x_grad->Resize(dx_dims);
  }
};
} // namespace operators
} // namespace paddle
......@@ -17,9 +17,6 @@ if (CUPTI_FOUND)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
if (WITH_WBAES)
cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
......@@ -48,8 +48,6 @@ DEFINE_string(
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(wbaes_dir, "", "Specify path for loading libwbaes.so.");
namespace paddle {
namespace platform {
namespace dynload {
......@@ -248,16 +246,6 @@ void* GetMKLMLDsoHandle() {
#endif
}
// Returns the handle produced by GetDsoHandleFromSearchPath for the WBAES
// shared library, searching the directory given by the `wbaes_dir` flag.
// Only the library file name differs per platform:
// .dylib on macOS, .dll on Windows, .so elsewhere.
void* GetWBAESDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.so");
#endif
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetWBAESDsoHandle();
} // namespace dynload
} // namespace platform
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_WBAES
#include <WBAESLib.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag wbaes_dso_flag;
extern void *wbaes_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load wbaes routine
* via operator overloading.
*/
#define DYNAMIC_LOAD_WBAES_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using wbaesFunc = decltype(&::__name); \
std::call_once(wbaes_dso_flag, []() { \
wbaes_dso_handle = paddle::platform::dynload::GetWBAESDsoHandle(); \
}); \
static void *p_##__name = dlsym(wbaes_dso_handle, #__name); \
return reinterpret_cast<wbaesFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WBAES_WRAP(__name) DYNAMIC_LOAD_WBAES_WRAP(__name)
#define WBAES_ROUTINE_EACH(__macro) __macro(GSECF);
WBAES_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WBAES_WRAP);
#undef DYNAMIC_LOAD_WBAES_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor imperative_profiler)
tracer analysis_predictor imperative_profiler nccl_context)
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
......
......@@ -29,7 +29,7 @@ namespace paddle {
namespace pybind {
// Bind Methods
void BindTracer(pybind11::module* m) {
void BindImperative(pybind11::module* m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__",
[](imperative::Tracer& self, framework::BlockDesc* root_block) {
......@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) {
})
.def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership);
// define parallel context
pybind11::class_<imperative::ParallelStrategy> parallel_strategy(
*m, "ParallelStrategy", "");
parallel_strategy.def(pybind11::init())
.def_property(
"nranks",
[](const imperative::ParallelStrategy& self) { return self.nranks_; },
[](imperative::ParallelStrategy& self, int nranks) {
self.nranks_ = nranks;
})
.def_property("local_rank",
[](const imperative::ParallelStrategy& self) {
return self.local_rank_;
},
[](imperative::ParallelStrategy& self, int local_rank) {
self.local_rank_ = local_rank;
})
.def_property(
"trainer_endpoints",
[](const imperative::ParallelStrategy& self) {
return self.trainer_endpoints_;
},
[](imperative::ParallelStrategy& self, std::vector<std::string> eps) {
self.trainer_endpoints_ = eps;
})
.def_property("current_endpoint",
[](const imperative::ParallelStrategy& self) {
return self.current_endpoint_;
},
[](imperative::ParallelStrategy& self,
const std::string& ep) { self.current_endpoint_ = ep; });
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
pybind11::class_<imperative::NCCLParallelContext> nccl_ctx(
*m, "NCCLParallelContext");
nccl_ctx
.def(pybind11::init<const imperative::ParallelStrategy&,
const platform::CUDAPlace&>())
.def("init", [](imperative::NCCLParallelContext& self) { self.Init(); });
#endif
}
} // namespace pybind
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
......@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase {
using imperative::VarBase::VarBase; // Inherit constructors
};
void BindTracer(pybind11::module* m);
void BindImperative(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) {
})
.def_static("num_funcs", &imperative::PyLayer::NumFuncs);
BindTracer(&m);
BindImperative(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer(
......
......@@ -32,6 +32,7 @@ default_envs = {
"NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0",
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
}
GPUS = 8
......
......@@ -29,6 +29,9 @@ from .tracer import *
from . import profiler
from .profiler import *
from . import parallel
from .parallel import *
from . import checkpoint
from .checkpoint import *
......@@ -41,5 +44,6 @@ __all__ += base.__all__
__all__ += nn.__all__
__all__ += tracer.__all__
__all__ += profiler.__all__
__all__ += parallel.__all__
__all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
......@@ -48,7 +48,7 @@ class Conv2D(layers.Layer):
bias_attr=None,
dtype=core.VarDesc.VarType.FP32):
assert param_attr is not False, "param_attr should not be False here."
super(Conv2D, self).__init__(name_scope)
super(Conv2D, self).__init__(name_scope, dtype)
self._groups = groups
self._stride = utils.convert_to_list(stride, 2, 'stride')
self._padding = utils.convert_to_list(padding, 2, 'padding')
......@@ -503,7 +503,7 @@ class FC(layers.Layer):
num_flatten_dims=1,
dtype=core.VarDesc.VarType.FP32,
act=None):
super(FC, self).__init__(name_scope)
super(FC, self).__init__(name_scope, dtype)
self._size = size
self._num_flatten_dims = num_flatten_dims
......@@ -608,7 +608,7 @@ class BatchNorm(layers.Layer):
do_model_average_for_mean_and_var=False,
fuse_with_relu=False,
use_global_stats=False):
super(BatchNorm, self).__init__(name_scope)
super(BatchNorm, self).__init__(name_scope, dtype)
self._param_attr = param_attr
self._param_attr = bias_attr
self._act = act
......@@ -760,7 +760,7 @@ class Embedding(layers.Layer):
param_attr=None,
dtype='float32'):
super(Embedding, self).__init__(name_scope)
super(Embedding, self).__init__(name_scope, dtype)
self._size = size
self._is_sparse = is_sparse
self._is_distributed = is_distributed
......@@ -1008,7 +1008,7 @@ class GRUUnit(layers.Layer):
gate_activation='sigmoid',
origin_mode=False,
dtype='float32'):
super(GRUUnit, self).__init__(name_scope)
super(GRUUnit, self).__init__(name_scope, dtype)
activation_dict = dict(
identity=0,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .. import core
__all__ = ["prepare_context"]
ParallelStrategy = core.ParallelStrategy
__parallel_ctx__clz__ = None
def prepare_context(parallel_strategy, place):
    """Create and initialize the process-wide parallel context.

    Args:
        parallel_strategy: the ``core.ParallelStrategy`` passed to the
            parallel context constructor.
        place: the device place; only ``core.CUDAPlace`` is supported.

    Raises:
        AssertionError: if a parallel context has already been created.
        NotImplementedError: if ``place`` is not a ``core.CUDAPlace``.
    """
    global __parallel_ctx__clz__
    assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."
    if not isinstance(place, core.CUDAPlace):
        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
        # The previous `assert ("...")` asserted a non-empty string, which is
        # always true, so execution fell through to `None.init()` below.
        raise NotImplementedError("Only support CUDAPlace for now.")
    __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy, place)
    __parallel_ctx__clz__.init()
class Env(object):
    """Snapshot of the distributed-training settings read from the environment.

    The values are read once, at construction time, from these variables:

    * ``PADDLE_TRAINERS_NUM`` (default ``"1"``) -> ``nranks``
    * ``PADDLE_TRAINER_ID`` (default ``"0"``) -> ``local_rank``
    * ``FLAGS_selected_gpus`` (default ``"0"``) -> ``dev_id``
    * ``PADDLE_TRAINER_ENDPOINTS`` (default ``""``) -> ``trainer_endpoints``
    * ``PADDLE_CURRENT_ENDPOINT`` (default ``""``) -> ``current_endpoint``
    """

    def __init__(self):
        self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        # Comma-separated list; an unset variable yields [""].
        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                            "").split(",")
        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")

    @property
    def nranks(self):
        """int: value of PADDLE_TRAINERS_NUM."""
        return self._nranks

    @property
    def local_rank(self):
        """int: value of PADDLE_TRAINER_ID."""
        return self._local_rank

    @property
    def dev_id(self):
        """int: value of FLAGS_selected_gpus."""
        return self._dev_id

    @property
    def current_endpoint(self):
        """str: value of PADDLE_CURRENT_ENDPOINT."""
        return self._current_endpoint

    @property
    def trainer_endpoints(self):
        """list of str: PADDLE_TRAINER_ENDPOINTS split on commas."""
        return self._trainer_endpoints
......@@ -191,6 +191,7 @@ __all__ = [
'kldiv_loss',
'tree_conv',
'npair_loss',
'pixel_shuffle',
'fsp_matrix',
]
......@@ -480,6 +481,8 @@ def dynamic_lstm(input,
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
assert _in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstm', **locals())
size = size // 4
......@@ -864,6 +867,9 @@ def dynamic_lstmp(input,
proj_activation="tanh")
"""
assert _in_dygraph_mode(
) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstmp', **locals())
size = size // 4
......@@ -1035,6 +1041,9 @@ def dynamic_gru(input,
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
"""
assert _in_dygraph_mode(
) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
helper = LayerHelper('gru', **locals())
dtype = helper.input_dtype()
......@@ -1751,6 +1760,8 @@ def sequence_conv(input,
Variable: output of sequence_conv
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()
filter_shape = [filter_size * input.shape[1], num_filters]
......@@ -1810,6 +1821,8 @@ def sequence_softmax(input, use_cudnn=False, name=None):
dtype='float32', lod_level=1)
x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_variable_for_type_inference(dtype)
......@@ -2302,6 +2315,8 @@ def sequence_pool(input, pool_type, is_test=False):
last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -2341,6 +2356,8 @@ def sequence_concat(input, name=None):
out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op(
......@@ -2468,6 +2485,8 @@ def sequence_slice(input, offset, length, name=None):
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -3927,6 +3946,8 @@ def sequence_expand(x, y, ref_level=-1, name=None):
dtype='float32', lod_level=1)
out = layers.sequence_expand(x=x, y=y, ref_level=0)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
......@@ -3993,6 +4014,8 @@ def sequence_expand_as(x, y, name=None):
dtype='float32', lod_level=1)
out = layers.sequence_expand_as(x=x, y=y)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
......@@ -4039,6 +4062,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -4105,6 +4130,8 @@ def sequence_unpad(x, length, name=None):
out = fluid.layers.sequence_unpad(x=x, length=len)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_unpad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -5278,6 +5305,8 @@ def sequence_reshape(input, new_dim):
x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_reshape', **locals())
out = helper.create_variable_for_type_inference(helper.input_dtype())
helper.append_op(
......@@ -5812,6 +5841,8 @@ def im2sequence(input,
input=layer, stride=[1, 1], filter_size=[2, 2])
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
if isinstance(filter_size, int):
filter_size = [filter_size, filter_size]
......@@ -6228,7 +6259,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
},
outputs={'Diff': diff,
'Out': loss},
attrs={'sigma': sigma})
attrs={'sigma': sigma if sigma is not None else 1.0})
return loss
......@@ -7589,6 +7620,8 @@ def sequence_scatter(input, index, updates, name=None):
output = fluid.layers.sequence_scatter(input, index, updates)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_scatter', **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -8677,6 +8710,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
x = fluid.layers.data(shape=[30, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
......@@ -8716,6 +8751,8 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
Variable: The output sequence mask.
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_mask', **locals())
if name is None:
......@@ -9766,6 +9803,8 @@ def sequence_reverse(x, name=None):
Returns:
out(${y_type}): ${y_comment}
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_reverse", **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......@@ -10923,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
return l2loss + celoss
def pixel_shuffle(x, upscale_factor):
    """
    **Pixel Shuffle Layer**

    This layer rearranges elements in a tensor of shape [N, C, H, W]
    to a tensor of shape [N, C/r**2, H*r, W*r], where r is upscale_factor.
    This is useful for implementing efficient sub-pixel convolution
    with a stride of 1/r.
    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution
    Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_ .
    by Shi et. al (2016) for more details.

    .. code-block:: text

        Given a 4-D tensor with the shape:
            x.shape = [1, 9, 4, 4]
        Given upscale_factor:
            upscale_factor= 3
        output shape is:
            [1, 1, 12, 12]

    Args:
        x(Variable): The input tensor variable.
        upscale_factor(int): factor to increase spatial resolution

    Returns:
        Out(Variable): the pixel shuffle result, a tensor variable of shape
        [N, C/r**2, H*r, W*r] with the same data type as the input.

    Raises:
        TypeError: If upscale_factor is not an int.
        ValueError: If the square of upscale_factor cannot divide the channels of input.

    Examples:
        .. code-block:: python

            input = fluid.layers.data(name="input", shape=[9,4,4])
            output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3)
    """
    # Validate first so that a bad argument leaves no helper or output
    # variable behind in the program.
    if not isinstance(upscale_factor, int):
        raise TypeError("upscale factor must be int type")

    helper = LayerHelper("pixel_shuffle", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)

    helper.append_op(
        type="pixel_shuffle",
        inputs={"X": x},
        outputs={"Out": out},
        attrs={"upscale_factor": upscale_factor})
    return out
def fsp_matrix(x, y):
"""
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
from test_conditional_block import *
if __name__ == '__main__':
unittest.main()
......@@ -302,8 +302,11 @@ use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 50
if not core.is_compiled_with_cuda():
# how many batches we use
batch_num = 50
else:
batch_num = 5
# Seed NumPy's global RNG so the randomly generated inputs are reproducible.
# (Assigning `np.random.seed = 1` would replace the function with an int
# instead of seeding the generator.)
np.random.seed(1)
src_word_np = np.random.randint(
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPixelShuffle(OpTest):
    """Checks the pixel_shuffle op against a NumPy reshape/transpose reference."""

    def setUp(self):
        self.op_type = "pixel_shuffle"
        batch, channels, height, width = 2, 9, 4, 4
        factor = 3
        out_channels = channels // (factor * factor)

        x = np.random.random([batch, channels, height, width]).astype(
            "float32")

        # View as (num, output_channel, upscale_factor, upscale_factor, h, w),
        # permute to (num, output_channel, h, upscale_factor, w,
        # upscale_factor), then collapse to the 4-D output shape.
        expected = x.reshape(
            (batch, out_channels, factor, factor, height, width))
        expected = expected.transpose(0, 1, 4, 2, 5, 3)
        expected = expected.reshape(
            [batch, out_channels, height * factor, width * factor])

        self.inputs = {'X': x}
        self.outputs = {'Out': expected}
        self.attrs = {'upscale_factor': factor}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
......@@ -157,10 +157,6 @@ package_data['paddle.libs']= []
package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_WBAES}' == 'ON':
package_data['paddle.libs'] += ['libwbaes' + ext_name]
shutil.copy('${WBAES_SHARED_LIB}', libs_path)
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_LIB}', libs_path)
shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册