Commit 92c8ac8a authored by lujun

merge conflict, test=develop

......@@ -75,7 +75,6 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface"
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
option(WITH_WBAES "Compile PaddlePaddle with WBAES support" ON)
# PY_VERSION
if(NOT PY_VERSION)
......@@ -149,7 +148,6 @@ include(external/dlpack)
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
include(external/wbaes) # download wbaes
if (NOT WIN32)
# there is no official support of nccl, cupti in windows
......
......@@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
if(WITH_WBAES)
add_definitions(-DPADDLE_WITH_WBAES)
endif(WITH_WBAES)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_WBAES})
return()
ENDIF(NOT ${WITH_WBAES})
INCLUDE(ExternalProject)
SET(WBAES_DST_DIR "wbaes")
SET(WBAES_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(WBAES_INSTALL_DIR ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR})
SET(WBAES_ROOT ${WBAES_INSTALL_DIR})
SET(WBAES_INC_DIR ${WBAES_ROOT}/include)
SET(WBAES_LIB_DIR ${WBAES_ROOT}/lib)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${WBAES_ROOT}/lib")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
IF(APPLE)
SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
ELSEIF(WIN32)
SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.lib)
SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dll)
ELSE()
SET(WBAES_TAG "v1.0.2" CACHE STRING "" FORCE)
SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.so)
SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.so)
ENDIF()
SET(WBAES_PROJECT "extern_wbaes")
MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
SET(WBAES_SOURCE_DIR "${THIRD_PARTY_PATH}/wbaes")
SET(WBAES_DOWNLOAD_DIR "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")
ExternalProject_Add(
${WBAES_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${WBAES_SOURCE_DIR}
URL ${WBAES_URL}
DOWNLOAD_DIR ${WBAES_DOWNLOAD_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR} &&
${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR}
)
INCLUDE_DIRECTORIES(${WBAES_INC_DIR})
ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_LOCATION ${WBAES_LIB})
SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1)
ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT})
......@@ -264,14 +264,6 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Only add libwbaes.so as a dependency; do not link it
if("${cc_library_DEPS};" MATCHES "wbaes;")
list(REMOVE_ITEM cc_library_DEPS wbaes)
if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes")
list(APPEND cc_library_DEPS dynload_wbaes)
endif()
add_dependencies(${TARGET_NAME} wbaes)
endif()
# Only add libmklml.so as a dependency; do not link it
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
......
......@@ -170,14 +170,6 @@ copy(snappystream_lib
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
if (WITH_WBAES)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes")
copy(wbaes_lib
SRCS ${WBAES_INC_DIR} ${WBAES_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS wbaes)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
......
......@@ -236,6 +236,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
......
......@@ -242,6 +242,11 @@ void InMemoryDataFeed<T>::SetTrainerNum(int trainer_num) {
trainer_num_ = trainer_num;
}
template <typename T>
void InMemoryDataFeed<T>::SetFleetSendBatchSize(int64_t size) {
fleet_send_batch_size_ = size;
}
template <typename T>
void InMemoryDataFeed<T>::PutInsToChannel(const std::string& ins_str) {
#ifdef _LINUX
......@@ -361,8 +366,13 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
VLOG(3) << "GlobalShuffle() begin, thread_id=" << thread_id_;
auto fleet_ptr = FleetWrapper::GetInstance();
std::vector<std::vector<T*>> send_vec(trainer_num_);
std::vector<int> send_index(trainer_num_);
uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_;
for (auto& vec : send_vec) {
vec.reserve(fleet_send_batch_size_);
vec.reserve(reserve_len);
}
for (int i = 0; i < trainer_num_; ++i) {
send_index[i] = i;
}
std::vector<std::future<int32_t>> total_status;
auto interval = GetMemoryDataInterval();
......@@ -375,7 +385,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
int64_t node_id = random_num % trainer_num_;
send_vec[node_id].push_back(&((*memory_data_)[i]));
if (i % fleet_send_batch_size_ == 0 && i != 0) {
for (int j = 0; j < send_vec.size(); ++j) {
// shuffle the sending order to avoid network timeout errors
std::random_shuffle(send_index.begin(), send_index.end());
for (int index = 0; index < send_index.size(); ++index) {
int j = send_index[index];
std::string send_str;
SerializeIns(send_vec[j], &send_str);
VLOG(3) << "send str_length=" << send_str.length()
......@@ -388,7 +401,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
}
}
}
for (int j = 0; j < send_vec.size(); ++j) {
// shuffle the sending order to avoid network timeout errors
std::random_shuffle(send_index.begin(), send_index.end());
for (int index = 0; index < send_index.size(); ++index) {
int j = send_index[index];
if (send_vec[j].size() != 0) {
std::string send_str;
SerializeIns(send_vec[j], &send_str);
......
......@@ -94,6 +94,8 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {}
// This function does nothing by default
virtual void SetTrainerNum(int trainer_num) {}
// This function does nothing by default
virtual void SetFleetSendBatchSize(int64_t size) {}
virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex;
}
......@@ -212,6 +214,7 @@ class InMemoryDataFeed : public PrivateQueueDataFeed<T> {
virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num);
virtual void SetTrainerNum(int trainer_num);
virtual void SetFleetSendBatchSize(int64_t size);
virtual void PutInsToChannel(const std::string& ins_str);
virtual void FillMemoryDataToChannel();
virtual void FillChannelToMemoryData();
......
......@@ -64,6 +64,17 @@ void DatasetImpl<T>::SetTrainerNum(int trainer_num) {
}
}
// If you run distributed training and want to do a global shuffle,
// set this before the global shuffle.
// Be sure to call CreateReaders before SetFleetSendBatchSize.
template <typename T>
void DatasetImpl<T>::SetFleetSendBatchSize(int64_t size) {
fleet_send_batch_size_ = size;
for (auto reader : readers_) {
reader->SetFleetSendBatchSize(size);
}
}
template <typename T>
void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
const std::string& fs_ugi) {
......
......@@ -47,6 +47,8 @@ class Dataset {
virtual void SetThreadNum(int thread_num) = 0;
// set workers' num
virtual void SetTrainerNum(int trainer_num) = 0;
// set fleet send batch size
virtual void SetFleetSendBatchSize(int64_t size) = 0;
// set fs name and ugi
virtual void SetHdfsConfig(const std::string& fs_name,
const std::string& fs_ugi) = 0;
......@@ -59,6 +61,8 @@ class Dataset {
virtual int GetThreadNum() = 0;
// get worker num
virtual int GetTrainerNum() = 0;
// get fleet send batch size
virtual int64_t GetFleetSendBatchSize() = 0;
// get hdfs config
virtual std::pair<std::string, std::string> GetHdfsConfig() = 0;
// get data feed desc
......@@ -98,6 +102,7 @@ class DatasetImpl : public Dataset {
virtual void SetFileList(const std::vector<std::string>& filelist);
virtual void SetThreadNum(int thread_num);
virtual void SetTrainerNum(int trainer_num);
virtual void SetFleetSendBatchSize(int64_t size);
virtual void SetHdfsConfig(const std::string& fs_name,
const std::string& fs_ugi);
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
......@@ -105,6 +110,7 @@ class DatasetImpl : public Dataset {
virtual const std::vector<std::string>& GetFileList() { return filelist_; }
virtual int GetThreadNum() { return thread_num_; }
virtual int GetTrainerNum() { return trainer_num_; }
virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_);
}
......@@ -137,6 +143,7 @@ class DatasetImpl : public Dataset {
std::string fs_name_;
std::string fs_ugi_;
unsigned int rand_seed;
int64_t fleet_send_batch_size_;
};
// use std::vector<MultiSlotType> as data type
......
......@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
// TODO(gongwb): polish them!
if (is_encoded) {
VLOG(1) << "Use dgc allreduce mode";
}
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
......@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
void SetFuseParameterGroupsSize(int group_size);
int GetFuseParameterGroupsSize();
void SetFuseParameterMemorySize(uint64_t memory_size);
uint64_t GetFuseParameterMemorySize();
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
template <typename AttrType>
void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const;
void SetGroupGradsAndParams(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const;
void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const;
void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const;
void SetGroupAccordingToGroupSize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const;
private:
bool IsSupportedVarType(const proto::VarType::Type &type) const;
void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, ir::Node *> &vars,
const std::string &fused_var_name,
const ParamsAndGrads &params_grads) const;
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -142,6 +142,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("memory_optimize_pass");
}
// Ideally, the runtime_context_cache pass should be the last pass, so that
// the attr is enabled on all original and fused operators. However, no
// operator can get this attr if the pass is placed after the MultiDevPass.
if (strategy_.cache_runtime_context_) {
VLOG(10) << "Add runtime_context_cache_pass";
AppendPass("runtime_context_cache_pass");
}
AppendMultiDevPass(strategy_);
if (strategy_.fuse_all_reduce_ops_) {
......@@ -243,7 +251,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
CreatePassesFromStrategy(false);
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
VLOG(3) << "apply " << pass->Type();
VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
......@@ -328,3 +336,4 @@ USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
......@@ -107,6 +107,8 @@ struct BuildStrategy {
std::vector<std::string> trainers_endpoints_;
bool remove_unnecessary_lock_{true};
bool cache_runtime_context_{false};
// NOTE:
// Before you add new options, think about whether it's a general strategy that
// works with other strategies. If not, the strategy should be created through
......
......@@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync(
std::vector<paddle::ps::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->mutable_data<float>(place);
paddle::ps::Region reg(g, tensor->numel());
......
......@@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
}
close_open_fds_internal();
if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) {
if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) {
return -1;
}
exit(127);
......
......@@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(expected_kernel_cache_pass base)
pass_library(quant_conv2d_dequant_fuse_pass inference)
pass_library(fillconstant_elementwisemul_fuse inference)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/expected_kernel_cache_pass.h"
#include <memory>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
namespace ir {
void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Applies Expected Kernel Cache strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
n->Op()->SetAttr(kEnableCacheExpectedKernel, true);
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(expected_kernel_cache_pass,
paddle::framework::ir::ExpectedKernelCachePass);
......@@ -12,23 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_WBAES
#pragma once
#include "paddle/fluid/platform/dynload/wbaes.h"
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace platform {
namespace dynload {
namespace framework {
namespace ir {
std::once_flag wbaes_dso_flag;
void *wbaes_dso_handle = nullptr;
class ExpectedKernelCachePass : public Pass {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WBAES_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace ir
} // namespace framework
} // namespace paddle
#endif
......@@ -23,7 +23,7 @@ namespace ir {
void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Applies Runtime Context Cache strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
if (n->IsOp() && n->Op()) {
n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
}
}
......
......@@ -899,50 +899,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.", type_);
if (!HasAttr(kEnableCacheExpectedKernel) || !kernel_type_) {
ChooseKernel(*runtime_ctx, scope, place);
}
OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: falling back to the PLAIN one";
expected_kernel_key.library_type_ = LibraryType::kPlain;
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,
KernelTypeToString(expected_kernel_key));
}
std::vector<KernelConfig>* kernel_configs =
GetKernelConfig(expected_kernel_key);
std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
// do data transform
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope = PrepareData(scope, expected_kernel_key,
&transfered_inplace_vars, runtime_ctx);
auto* transfer_scope =
PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
// exec scope is the scope that kernel actually executed on.
const Scope& exec_scope =
(transfer_scope == nullptr ? scope : *transfer_scope);
if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
dev_ctx = pool.Get(expected_kernel_key.place_);
if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
dev_ctx = pool.Get(kernel_type_->place_);
}
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
......@@ -951,8 +924,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only passes inputs and gets outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
*runtime_ctx, kernel_configs));
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
kernel_configs));
if (!transfered_inplace_vars.empty()) {
// there are inplace variables that have been transferred.
......@@ -978,6 +951,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
}
void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
const Scope& scope,
const platform::Place& place) const {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.", type_);
}
OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: falling back to the PLAIN one";
expected_kernel_key.library_type_ = LibraryType::kPlain;
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,
KernelTypeToString(expected_kernel_key));
}
kernel_type_.reset(new OpKernelType(expected_kernel_key));
kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
}
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
......
......@@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// this Op's execution to save the elapsed time.
constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
/// If an Op has the attribute kEnableCacheExpectedKernel, it means that in the
/// same name scope and on the same place, the expected kernel of this Op does
/// not change during execution, so it can be recorded only at the first
/// iteration of this Op's execution to save the elapsed time.
constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@";
/// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip calling this Op's InferShape()
......@@ -491,8 +497,13 @@ class OperatorWithKernel : public OperatorBase {
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
const platform::Place& place) const;
protected:
mutable OpKernelConfigsMap kernel_configs_map_;
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr;
};
......
......@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void NCCLParallelContext::RecvNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL,
"The endpoint should contain host and port: %s", ep);
std::string host = addr[0];
int port = std::stoi(addr[1]);
int server_fd, new_socket;
struct sockaddr_in address;
int addrlen = sizeof(address);
char buffer[1024] = {0};
int opt = 0;
// creating socket fd
if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
PADDLE_THROW("create server fd failed");
if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
PADDLE_THROW("set socket opt failed");
address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port);
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
PADDLE_THROW("binding failed on ep: %s", ep);
VLOG(3) << "listening on: " << ep;
if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
if ((new_socket =
accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
reinterpret_cast<socklen_t *>(&addrlen))) < 0)
PADDLE_THROW("accept the new socket fd failed");
if (read(new_socket, buffer, 1024) < 0)
PADDLE_THROW("reading the ncclUniqueId from socket failed");
VLOG(3) << "received the ncclUniqueId";
memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
VLOG(3) << "closing the socket server: " << ep;
close(server_fd);
}
void NCCLParallelContext::SendNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL,
"The endpoint should contain host and port: %s", ep);
std::string host = addr[0];
int port = std::stoi(addr[1]);
// struct sockaddr_in address;
int sock = 0;
struct sockaddr_in serv_addr;
char buffer[1024] = {0};
memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
PADDLE_THROW("create socket failed");
memset(&serv_addr, 0, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
PADDLE_THROW("invalid address: %s", ep);
while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
VLOG(0) << "worker: " << ep
<< " is not ready, will retry after 3 seconds...";
std::this_thread::sleep_for(std::chrono::seconds(3));
continue;
}
VLOG(3) << "sending the ncclUniqueId to " << ep;
send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
break;
}
}
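// Broadcast the NCCL unique id from the root trainer to all the others: the
// root sends the id over TCP to every other endpoint, while each non-root
// trainer receives it on its own endpoint.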
void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
if (strategy_.local_rank_ == root) {
for (auto ep : strategy_.trainer_endpoints_) {
if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
}
} else {
RecvNCCLID(strategy_.current_endpoint_, nccl_id);
}
}
void NCCLParallelContext::Init() {
ncclUniqueId nccl_id;
ncclComm_t comm;
if (strategy_.local_rank_ == 0) {
// generate the unique ncclid on the root worker
platform::dynload::ncclGetUniqueId(&nccl_id);
BcastNCCLId(&nccl_id, 0);
} else {
BcastNCCLId(&nccl_id, 0);
}
int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
&comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
dev_ctx->set_nccl_comm(comm);
}
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() {}
virtual void Init() = 0;
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class NCCLParallelContext : ParallelContext {
public:
explicit NCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
void BcastNCCLId(ncclUniqueId* nccl_id, int root);
void Init() override;
protected:
void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
};
#endif
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];
strategy.nranks_ = 2;
strategy.local_rank_ = local_rank;
return strategy;
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
auto strategy = GetStrategy(local_rank);
platform::CUDAPlace gpu(local_rank);
imperative::NCCLParallelContext ctx(strategy, gpu);
ctx.BcastNCCLId(nccl_id, 0);
}
TEST(BcastNCCLId, Run) {
ncclUniqueId nccl_id;
platform::dynload::ncclGetUniqueId(&nccl_id);
std::thread t(BcastNCCLId, 0, &nccl_id);
ncclUniqueId recv_nccl_id;
BcastNCCLId(1, &recv_nccl_id);
t.join();
EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
NCCL_UNIQUE_ID_BYTES));
}
#endif
......@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
current_vars_map[out->Name()] = out;
}
VLOG(3) << "input var name: " << out->Name()
VLOG(3) << "output var name: " << out->Name()
<< " inited: " << out->var_->IsInitialized()
<< " stop_grad: " << out->IsStopGradient();
}
......@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::Scope scope;
op->place_ = GetExpectedPlace(expected_place, inputs);
PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
prepared_op.func(
......
......@@ -231,6 +231,7 @@ void AnalysisConfig::Update() {
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
pass_builder()->DeletePass("runtime_context_cache_pass");
pass_builder()->DeletePass("expected_kernel_cache_pass");
}
if (use_mkldnn_) {
......
......@@ -86,7 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"infer_clean_graph_pass", //
"runtime_context_cache_pass", //
// "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
......@@ -96,9 +97,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif //
"transpose_flatten_concat_fuse_pass",
"expected_kernel_cache_pass", //
});
use_gpu_ = true;
......@@ -116,7 +117,11 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"infer_clean_graph_pass", //
// TODO(luotao): runtime_context_cache_pass should be located in the
// front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
// will enhance this pass later.
"runtime_context_cache_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
......@@ -132,7 +137,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"runtime_context_cache_pass", //
"expected_kernel_cache_pass", //
});
use_gpu_ = false;
......
......@@ -23,18 +23,11 @@ namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false);
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
cfg->pass_builder()->SetPasses(
{"infer_clean_graph_pass", "mkldnn_placement_pass",
"depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
"conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
"conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
"fc_fuse_pass", "is_test_pass"});
}
template <typename T>
......@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
std::to_string(num_images) + " is bigger than all test data size.");
PaddleTensor images;
images.name = "input";
images.name = "image";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.name = "label";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
......@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "input");
image_batch_shape, "image");
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
......
# INT8 MKL-DNN quantization
This document describes how to use the Paddle inference engine to convert an FP32 model to an INT8 model, using ResNet-50 and MobileNet-V1 as examples. We provide instructions for enabling INT8 MKL-DNN quantization in Paddle inference and report the accuracy and performance of ResNet-50 and MobileNet-V1.
## 0. Install PaddlePaddle
Follow the PaddlePaddle [installation instructions](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, use the following cmake arguments:
```
cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
```
Note: MKL-DNN and MKL are required.
## 1. Enable INT8 MKL-DNN quantization
For reference, please examine the code of the unit test in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
* ### Create Analysis config
INT8 quantization is one of the optimizations available in the analysis config. More information about the analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease).
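For illustration, here is a minimal sketch of building such a config for CPU INT8 inference (assuming the public `paddle::AnalysisConfig` API; the header path, model path, and single-thread setting are assumptions mirroring the `SetConfig` helper in the tester above):
```cpp
#include <string>
#include "paddle_inference_api.h"  // public inference header; install layout may vary

// Hedged sketch: build an analysis config for CPU INT8 inference.
paddle::AnalysisConfig MakeInt8Config(const std::string& model_dir) {
  paddle::AnalysisConfig cfg;
  cfg.SetModel(model_dir);             // placeholder model directory
  cfg.DisableGpu();                    // MKL-DNN INT8 runs on the CPU
  cfg.SwitchIrOptim();                 // enable IR optimization passes
  cfg.SetCpuMathLibraryNumThreads(1);  // single-core benchmark setting
  cfg.EnableMKLDNN();                  // required for MKL-DNN INT8 kernels
  return cfg;
}
```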
* ### Create quantize config by analysis config
We enable the MKL-DNN quantization procedure by calling the appropriate method on the analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies, etc.) can be set through the quantizer config, which is part of the analysis config. It is also necessary to specify a pre-processed warmup dataset and the desired batch size.
```cpp
// Enable MKL-DNN quantization
cfg.EnableMkldnnQuantizer();
// Use the analysis config to access the MKL-DNN quantizer config
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
```
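The warmup data is what the quantizer uses to observe tensor value ranges and derive the quantization scales, so it should be pre-processed in the same way as the real inputs.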
## 2. Accuracy and Performance benchmark
We provide accuracy and performance results measured on a single core of an Intel(R) Xeon(R) Gold 6271.
>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
Notes:
* Measurement of accuracy requires a model that accepts two inputs: data and labels.
* Sampling the warmup data with a different batch size may cause a slight difference in INT8 top-1 accuracy.
* C-API performance is better than Python API performance because of Python overhead; this is especially noticeable for small, computationally light models.
## 3. Commands to reproduce the above accuracy and performance benchmark
* #### Full dataset (Single core)
* ##### Download full ImageNet Validation Dataset
```bash
cd /PATH/TO/PADDLE/build
python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
```
The converted binary data file is saved by default in `~/.cache/paddle/dataset/int8/download/int8_full_val.bin`.
* ##### ResNet50 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
* ##### Mobilenet-v1 Full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
......@@ -55,6 +55,9 @@ DEFINE_bool(record_benchmark, false,
DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
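// The following warmup flag is a standard gflags boolean; it can be enabled
// with --warmup=true on the test binary's command line.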
DEFINE_bool(warmup, false,
            "Use warmup to calculate elapsed_time more accurately. "
            "To reduce CI time, it is false by default.");
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
......@@ -316,7 +319,8 @@ void PredictionRun(PaddlePredictor *predictor,
int num_threads, int tid) {
int num_times = FLAGS_repeat;
int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
if (FLAGS_iterations > 0 &&
FLAGS_iterations < static_cast<int64_t>(inputs.size()))
iterations =
FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations);
......@@ -329,14 +333,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif
if (!FLAGS_zero_copy) {
run_timer.tic();
for (size_t i = 0; i < iterations; i++) {
for (int i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
}
}
elapsed_time = run_timer.toc();
} else {
for (size_t i = 0; i < iterations; i++) {
for (int i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic();
for (int j = 0; j < num_times; j++) {
......@@ -366,9 +370,10 @@ void TestOneThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
if (FLAGS_warmup) {
PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
}
PredictionRun(predictor.get(), inputs, outputs, 1, 0);
}
void TestMultiThreadPrediction(
......@@ -395,7 +400,10 @@ void TestMultiThreadPrediction(
->SetMkldnnThreadID(static_cast<int>(tid) + 1);
}
#endif
PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid);
if (FLAGS_warmup) {
PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads,
tid);
}
PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
});
}
......
......@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
......@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp {
framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block");
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
Outputs(framework::GradVarName("Input")));
const auto &ins = Inputs("Input");
const auto &d_ins = Outputs(framework::GradVarName("Input"));
const auto &conds = Inputs("Cond");
const auto &d_conds = Outputs(framework::GradVarName("Cond"));
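// Collect the gradient var names of both Input and Cond; they are passed to
// Executor::Run below so these gradients survive the sub-block run and can
// be assigned back to the parent scope afterwards.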
std::vector<std::string> ins_conds_grads;
ins_conds_grads.reserve(ins.size() + conds.size());
for (auto &in : ins) {
ins_conds_grads.emplace_back(framework::GradVarName(in));
}
for (auto &cond : conds) {
ins_conds_grads.emplace_back(framework::GradVarName(cond));
}
exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
ins_conds_grads);
AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
ins.size(), d_ins);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
Outputs(framework::GradVarName("Cond")));
AssignLocalGradientToGlobal(dev_place, cur_scope,
ins_conds_grads.data() + ins.size(),
conds.size(), d_conds);
}
}
private:
void AssignLocalGradientToGlobal(
const platform::Place &place, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names,
const std::string *p_grad_names, size_t p_grad_names_num,
const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) {
for (size_t i = 0; i < p_grad_names_num; ++i) {
auto out_grad_name = pg_names[i];
auto in_grad_name = framework::GradVarName(p_names[i]);
const auto &in_grad_name = p_grad_names[i];
auto *in_var = cur_scope.FindVar(in_grad_name);
if (in_var == nullptr) {
continue;
......
......@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__)
EXPECT_EQ(kers.size(), 10UL);
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
size_t target_num = 8;
#ifdef __AVX__
target_num += 2;
#endif
#ifdef PADDLE_WITH_MKLML
target_num += 12;
#endif
EXPECT_EQ(kers.size(), target_num);
}
TEST(JITKernel_pool, refer) {
......
/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
#include <memory>
namespace paddle {
namespace operators {
class PixelShuffleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PixelShuffleOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PixelShuffleOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(input_dims.size() == 4, "The input should be a 4-D tensor in NCHW layout.");
auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0,
               "The square of upscale_factor should divide the number of channels");
auto output_dims = input_dims;
output_dims[0] = input_dims[0];
output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
output_dims[2] = input_dims[2] * upscale_factor;
output_dims[3] = input_dims[3] * upscale_factor;
ctx->SetOutputDim("Out", output_dims);
}
};
class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor, default Tensor<float>), "
"the input feature data of PixelShuffleOp, the layout is [N C H W].");
AddOutput(
"Out",
"(Tensor, default Tensor<float>), the output of "
"PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
AddAttr<int>("upscale_factor",
"the factor to increase spatial resolution by.")
.SetDefault(1)
.AddCustomChecker([](const int& upscale_factor) {
PADDLE_ENFORCE_GE(upscale_factor, 1,
"upscale_factor should be larger than 0.");
});
AddComment(R"DOC(
Pixel Shuffle operator
This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
to a tensor of shape :math:`(*, C, H \times r, W \times r)`.
This is useful for implementing efficient sub-pixel convolution
with a stride of :math:`1/r`.
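For example, with :math:`r = 3`, an input of shape :math:`(N, 9, 4, 4)` is
rearranged into an output of shape :math:`(N, 1, 12, 12)`.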
Please refer to the paper:
`Real-Time Single Image and Video Super-Resolution Using an Efficient
Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et al. (2016) for more details.
)DOC");
}
};
class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("pixel_shuffle_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
class PixelShuffleGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) should not be null");
auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE(do_dims.size() == 4, "The input (Out@Grad) should be a 4-D tensor in NCHW layout.");
auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
auto dx_dims = do_dims;
dx_dims[0] = do_dims[0];
dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
dx_dims[2] = do_dims[2] / upscale_factor;
dx_dims[3] = do_dims[3] / upscale_factor;
ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
ops::PixelShuffleGradMaker);
REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
REGISTER_OP_CPU_KERNEL(
pixel_shuffle,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
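// Forward pixel shuffle: view the input as (N, C/r^2, r, r, H, W), transpose
// it to (N, C/r^2, H, r, W, r), and reshape the result to
// (N, C/r^2, H*r, W*r). The transpose is performed by math::Transpose below.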
template <typename DeviceContext, typename T>
class PixelShuffleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int factor = ctx.Attr<int>("upscale_factor");
auto in_dims = in->dims();
auto o_dims = out->dims();
framework::Tensor t;
t.ShareDataWith(*in);
t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
std::vector<int> axis = {0, 1, 4, 2, 5, 3};
framework::Tensor o;
o.ShareDataWith(*out);
o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
math::Transpose<DeviceContext, T, 6> trans;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
trans(dev_ctx, t, &o, axis);
out->Resize(o_dims);
}
};
template <typename DeviceContext, typename T>
class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
int factor = ctx.Attr<int>("upscale_factor");
auto do_dims = dout->dims();
auto dx_dims = dx->dims();
framework::Tensor t;
t.ShareDataWith(*dout);
t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
std::vector<int> axis = {0, 1, 3, 5, 2, 4};
framework::Tensor o;
o.ShareDataWith(*dx);
o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
math::Transpose<DeviceContext, T, 6> trans;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
trans(dev_ctx, t, &o, axis);
dx->Resize(dx_dims);
}
};
} // namespace operators
} // namespace paddle
......@@ -17,9 +17,6 @@ if (CUPTI_FOUND)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
if (WITH_WBAES)
cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes)
endif()
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
......
......@@ -48,8 +48,6 @@ DEFINE_string(
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
DEFINE_string(wbaes_dir, "", "Specify path for loading libwbaes.so.");
namespace paddle {
namespace platform {
namespace dynload {
......@@ -248,16 +246,6 @@ void* GetMKLMLDsoHandle() {
#endif
}
void* GetWBAESDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dll");
#else
return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.so");
#endif
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
void* GetWBAESDsoHandle();
} // namespace dynload
} // namespace platform
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_WBAES
#include <WBAESLib.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag wbaes_dso_flag;
extern void *wbaes_dso_handle;
/**
 * The following macro definition generates a struct
 * (for each function) that dynamically loads the wbaes routine
 * via operator overloading.
 */
#define DYNAMIC_LOAD_WBAES_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using wbaesFunc = decltype(&::__name); \
std::call_once(wbaes_dso_flag, []() { \
wbaes_dso_handle = paddle::platform::dynload::GetWBAESDsoHandle(); \
}); \
static void *p_##__name = dlsym(wbaes_dso_handle, #__name); \
return reinterpret_cast<wbaesFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WBAES_WRAP(__name) DYNAMIC_LOAD_WBAES_WRAP(__name)
#define WBAES_ROUTINE_EACH(__macro) __macro(GSECF);
WBAES_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WBAES_WRAP);
#undef DYNAMIC_LOAD_WBAES_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor imperative_profiler)
tracer analysis_predictor imperative_profiler nccl_context)
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
......
......@@ -50,11 +50,15 @@ void BindDataset(py::module* m) {
.def("set_filelist", &framework::Dataset::SetFileList)
.def("set_thread_num", &framework::Dataset::SetThreadNum)
.def("set_trainer_num", &framework::Dataset::SetTrainerNum)
.def("set_fleet_send_batch_size",
&framework::Dataset::SetFleetSendBatchSize)
.def("set_hdfs_config", &framework::Dataset::SetHdfsConfig)
.def("set_data_feed_desc", &framework::Dataset::SetDataFeedDesc)
.def("get_filelist", &framework::Dataset::GetFileList)
.def("get_thread_num", &framework::Dataset::GetThreadNum)
.def("get_trainer_num", &framework::Dataset::GetTrainerNum)
.def("get_fleet_send_batch_size",
&framework::Dataset::GetFleetSendBatchSize)
.def("get_hdfs_config", &framework::Dataset::GetHdfsConfig)
.def("get_data_feed_desc", &framework::Dataset::GetDataFeedDesc)
.def("register_client2client_msg_handler",
......
......@@ -29,7 +29,7 @@ namespace paddle {
namespace pybind {
// Bind Methods
void BindTracer(pybind11::module* m) {
void BindImperative(pybind11::module* m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__",
[](imperative::Tracer& self, framework::BlockDesc* root_block) {
......@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) {
})
.def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership);
// define parallel context
pybind11::class_<imperative::ParallelStrategy> parallel_strategy(
*m, "ParallelStrategy", "");
parallel_strategy.def(pybind11::init())
.def_property(
"nranks",
[](const imperative::ParallelStrategy& self) { return self.nranks_; },
[](imperative::ParallelStrategy& self, int nranks) {
self.nranks_ = nranks;
})
.def_property("local_rank",
[](const imperative::ParallelStrategy& self) {
return self.local_rank_;
},
[](imperative::ParallelStrategy& self, int local_rank) {
self.local_rank_ = local_rank;
})
.def_property(
"trainer_endpoints",
[](const imperative::ParallelStrategy& self) {
return self.trainer_endpoints_;
},
[](imperative::ParallelStrategy& self, std::vector<std::string> eps) {
self.trainer_endpoints_ = eps;
})
.def_property("current_endpoint",
[](const imperative::ParallelStrategy& self) {
return self.current_endpoint_;
},
[](imperative::ParallelStrategy& self,
const std::string& ep) { self.current_endpoint_ = ep; });
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
pybind11::class_<imperative::NCCLParallelContext> nccl_ctx(
*m, "NCCLParallelContext");
nccl_ctx
.def(pybind11::init<const imperative::ParallelStrategy&,
const platform::CUDAPlace&>())
.def("init", [](imperative::NCCLParallelContext& self) { self.Init(); });
#endif
}
} // namespace pybind
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
......@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase {
using imperative::VarBase::VarBase; // Inherit constructors
};
void BindTracer(pybind11::module* m);
void BindImperative(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
......@@ -165,6 +166,11 @@ PYBIND11_MODULE(core, m) {
// to enable eager deletion mode in unittest.
m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
m.def("_set_fuse_parameter_group_size",
&paddle::framework::details::SetFuseParameterGroupsSize);
m.def("_set_fuse_parameter_memory_size",
&paddle::framework::details::SetFuseParameterMemorySize);
m.add_object("_cleanup",
py::capsule([]() { ScopePool::Instance().Clear(); }));
......@@ -288,7 +294,7 @@ PYBIND11_MODULE(core, m) {
})
.def_static("num_funcs", &imperative::PyLayer::NumFuncs);
BindTracer(&m);
BindImperative(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer(
......@@ -1356,6 +1362,10 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_all_reduce_ops",
[](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; },
[](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
.def_property(
"cache_runtime_context",
[](const BuildStrategy &self) { return self.cache_runtime_context_; },
[](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
.def("_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true);
......
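The newly bound cache_runtime_context flag is toggled like any other BuildStrategy field. A hedged usage sketch (program construction is elided and "loss" is a placeholder name):

import paddle.fluid as fluid
from paddle.fluid import compiler

build_strategy = fluid.BuildStrategy()
# reuse the cached runtime context across iterations instead of rebuilding it
build_strategy.cache_runtime_context = True

compiled_prog = compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name="loss", build_strategy=build_strategy)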
......@@ -32,6 +32,7 @@ default_envs = {
"NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0",
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
}
GPUS = 8
......
......@@ -241,11 +241,13 @@ class InMemoryDataset(DatasetBase):
fleet: fleet singleton. Default None.
"""
trainer_num = 1
fleet_send_batch_size = 80000
if fleet is not None:
fleet.fleet_instance.role_maker_._barrier_worker()
trainer_num = fleet.worker_num()
self.dataset.register_client2client_msg_handler()
self.dataset.set_trainer_num(trainer_num)
self.dataset.set_fleet_send_batch_size(fleet_send_batch_size)
if fleet is not None:
fleet.fleet_instance.role_maker_._barrier_worker()
self.dataset.global_shuffle()
......
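For reference, a minimal local call path for the method changed above; without a fleet instance, trainer_num stays 1 and the shuffle is process-local. This is a sketch only: the file names are placeholders and slot/pipe configuration is elided.

import paddle.fluid as fluid

dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_filelist(["shuffle_a.txt", "shuffle_b.txt"])  # placeholder files
dataset.set_thread_num(2)
dataset.load_into_memory()
dataset.global_shuffle()  # fleet=None: worker barriers are skipped, shuffle is local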
......@@ -29,6 +29,9 @@ from .tracer import *
from . import profiler
from .profiler import *
from . import parallel
from .parallel import *
from . import checkpoint
from .checkpoint import *
......@@ -41,5 +44,6 @@ __all__ += base.__all__
__all__ += nn.__all__
__all__ += tracer.__all__
__all__ += profiler.__all__
__all__ += parallel.__all__
__all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
......@@ -151,7 +151,7 @@ class Conv2D(layers.Layer):
bias_attr=None,
dtype=core.VarDesc.VarType.FP32):
assert param_attr is not False, "param_attr should not be False here."
super(Conv2D, self).__init__(name_scope)
super(Conv2D, self).__init__(name_scope, dtype)
self._groups = groups
self._stride = utils.convert_to_list(stride, 2, 'stride')
self._padding = utils.convert_to_list(padding, 2, 'padding')
......@@ -860,7 +860,7 @@ class FC(layers.Layer):
num_flatten_dims=1,
dtype=core.VarDesc.VarType.FP32,
act=None):
super(FC, self).__init__(name_scope)
super(FC, self).__init__(name_scope, dtype)
self._size = size
self._num_flatten_dims = num_flatten_dims
......@@ -1050,7 +1050,7 @@ class BatchNorm(layers.Layer):
do_model_average_for_mean_and_var=False,
fuse_with_relu=False,
use_global_stats=False):
super(BatchNorm, self).__init__(name_scope)
super(BatchNorm, self).__init__(name_scope, dtype)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
......@@ -1202,7 +1202,7 @@ class Embedding(layers.Layer):
param_attr=None,
dtype='float32'):
super(Embedding, self).__init__(name_scope)
super(Embedding, self).__init__(name_scope, dtype)
self._size = size
self._is_sparse = is_sparse
self._is_distributed = is_distributed
......@@ -1450,7 +1450,7 @@ class GRUUnit(layers.Layer):
gate_activation='sigmoid',
origin_mode=False,
dtype='float32'):
super(GRUUnit, self).__init__(name_scope)
super(GRUUnit, self).__init__(name_scope, dtype)
activation_dict = dict(
identity=0,
......
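Each dygraph layer above now forwards dtype to the Layer base class instead of dropping it, so parameters are created with the dtype the caller asked for. A hedged sketch of the effect (constructor arguments beyond name_scope and size are assumed to follow this release's layout):

import paddle.fluid as fluid
from paddle.fluid import core

with fluid.dygraph.guard():
    # dtype now reaches Layer.__init__ and is used when creating parameters
    fc = fluid.dygraph.FC("fc", 10, dtype=core.VarDesc.VarType.FP32)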
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .. import core
__all__ = ["prepare_context"]
ParallelStrategy = core.ParallelStrategy
__parallel_ctx__clz__ = None
def prepare_context(parallel_strategy, place):
global __parallel_ctx__clz__
assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."
if isinstance(place, core.CUDAPlace):
__parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy,
place)
else:
# TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
assert ("Only support CUDAPlace for now.")
__parallel_ctx__clz__.init()
class Env(object):
def __init__(self):
self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
@property
def nranks(self):
return self._nranks
@property
def local_rank(self):
return self._local_rank
@property
def dev_id(self):
return self._dev_id
@property
def current_endpoint(self):
return self._current_endpoint
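Env and prepare_context are meant to be used together: Env collects the launch settings from the environment, and prepare_context feeds them into the NCCL context bound above. A hedged sketch of a dygraph trainer (assumes a CUDA build and the usual PADDLE_* variables set by the launcher):

import os
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import Env, ParallelStrategy, prepare_context

env = Env()  # reads PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, ... once
strategy = ParallelStrategy()
strategy.nranks = env.nranks
strategy.local_rank = env.local_rank
strategy.current_endpoint = env.current_endpoint
strategy.trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",")

place = fluid.CUDAPlace(env.dev_id)
with fluid.dygraph.guard(place):
    prepare_context(strategy, place)  # creates and inits the NCCLParallelContext
    # ... build the model and run the training loop here ...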
......@@ -712,7 +712,7 @@ class Executor(object):
if dataset is None:
    raise RuntimeError("dataset is needed and should be initialized")
if self.place == paddle.fluid.CUDAPlace():
if not isinstance(self.place, core.CPUPlace):
    raise RuntimeError("infer_from_dataset is verified on CPUPlace; "
                       "we will enable CUDAPlace in the future")
......@@ -796,7 +796,7 @@ class Executor(object):
if dataset is None:
    raise RuntimeError("dataset is needed and should be initialized")
if self.place == paddle.fluid.CUDAPlace():
if not isinstance(self.place, core.CPUPlace):
    raise RuntimeError("train_from_dataset is verified on CPUPlace; "
                       "we will enable CUDAPlace in the future")
......
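Both methods now check the executor's own place rather than comparing against a freshly constructed CUDAPlace. A minimal sketch of the supported CPU path, with dataset slot configuration elided and placeholder file names:

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())  # a CUDAPlace executor would raise here
exe.run(fluid.default_startup_program())

dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.set_filelist(["train_a.txt", "train_b.txt"])  # placeholder files
# ... dataset.set_use_var(...) / set_pipe_command(...) as the model requires ...

exe.train_from_dataset(fluid.default_main_program(), dataset)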
......@@ -123,18 +123,25 @@ class Fleet(object):
print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1)
def init_worker(self, programs):
def init_worker(self, programs, scopes=None):
"""
init_worker(): will be called by user. When a user knows current process is_worker(), he/she
should call init_worker() to initialize global information about worker and connect
worker with pserver.
worker with pserver. You should run the startup program before calling init_worker.
Args:
programs(Program|list): a Program or a list of Programs
scopes(Scope|list): a Scope or a list of Scopes, default None.
"""
if not isinstance(programs, list):
programs = [programs]
if scopes is None:
scopes = [fluid.global_scope()] * len(programs)
if len(scopes) != len(programs):
print(
"You should make sure len(scopes) == len(programs) or set scopes None"
)
sys.exit(-1)
if self._opt_info:
if "fleet_desc" in self._opt_info:
self._dist_desc_str = text_format.MessageToString(
......@@ -160,7 +167,7 @@ class Fleet(object):
self.role_maker_._barrier_worker()
if self.role_maker_._is_first_worker():
tables = self._dist_desc.trainer_param.dense_table
for prog in programs:
for prog, scope in zip(programs, scopes):
prog_id = str(id(prog))
prog_conf = self._opt_info['program_configs'][prog_id]
prog_tables = {}
......@@ -174,10 +181,16 @@ class Fleet(object):
continue
var_name_list = []
for i in range(0, len(table.dense_variable_name)):
var_name_list.append(table.dense_variable_name[i])
self._fleet_ptr.init_model(prog.desc,
int(table.table_id),
var_name_list)
var_name = table.dense_variable_name[i]
if scope.find_var(var_name) is None:
print("var " + var_name +
" not found in scope, " +
"you should run startup program first")
sys.exit(-1)
var_name_list.append(var_name)
self._fleet_ptr.init_model(scope,
int(table.table_id),
var_name_list)
# barrier for init model done
self.role_maker_._barrier_worker()
else:
......
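With the new scopes argument each program is paired with the scope that holds its dense variables; omitting it falls back to the global scope for every program. A hedged sketch (`fleet` stands for the singleton this module exposes; how it is imported depends on the release layout):

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())  # must run before init_worker

main_prog = fluid.default_main_program()
scope = fluid.global_scope()

# len(scopes) must equal len(programs); scopes=None means global scope for all
fleet.init_worker([main_prog], scopes=[scope])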
......@@ -191,6 +191,7 @@ __all__ = [
'kldiv_loss',
'tree_conv',
'npair_loss',
'pixel_shuffle',
'fsp_matrix',
]
......@@ -480,6 +481,8 @@ def dynamic_lstm(input,
forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
"""
assert not _in_dygraph_mode(), (
    "please use lstm instead of dynamic_lstm in dygraph mode!")
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals())
size = size // 4
......@@ -864,6 +867,9 @@ def dynamic_lstmp(input,
proj_activation="tanh")
"""
assert not _in_dygraph_mode(), (
    "please use lstm instead of dynamic_lstmp in dygraph mode!")
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstmp', **locals())
size = size // 4
......@@ -1035,6 +1041,9 @@ def dynamic_gru(input,
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
"""
assert not _in_dygraph_mode(), (
    "please use gru instead of dynamic_gru in dygraph mode!")
helper = LayerHelper('gru', **locals())
dtype = helper.input_dtype()
......@@ -1751,6 +1760,8 @@ def sequence_conv(input,
Variable: output of sequence_conv
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()
filter_shape = [filter_size * input.shape[1], num_filters]
......@@ -1810,6 +1821,8 @@ def sequence_softmax(input, use_cudnn=False, name=None):
dtype='float32', lod_level=1)
x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_variable_for_type_inference(dtype)
......@@ -2302,6 +2315,8 @@ def sequence_pool(input, pool_type, is_test=False):
last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pool', **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -2341,6 +2356,8 @@ def sequence_concat(input, name=None):
out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_concat', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op(
......@@ -2468,6 +2485,8 @@ def sequence_slice(input, offset, length, name=None):
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -3927,6 +3946,8 @@ def sequence_expand(x, y, ref_level=-1, name=None):
dtype='float32', lod_level=1)
out = layers.sequence_expand(x=x, y=y, ref_level=0)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
......@@ -3993,6 +4014,8 @@ def sequence_expand_as(x, y, name=None):
dtype='float32', lod_level=1)
out = layers.sequence_expand_as(x=x, y=y)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_variable_for_type_inference(dtype)
......@@ -4039,6 +4062,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -4105,6 +4130,8 @@ def sequence_unpad(x, length, name=None):
out = fluid.layers.sequence_unpad(x=x, length=len)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_unpad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -5278,6 +5305,8 @@ def sequence_reshape(input, new_dim):
x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_reshape', **locals())
out = helper.create_variable_for_type_inference(helper.input_dtype())
helper.append_op(
......@@ -5812,6 +5841,8 @@ def im2sequence(input,
input=layer, stride=[1, 1], filter_size=[2, 2])
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
if isinstance(filter_size, int):
filter_size = [filter_size, filter_size]
......@@ -6228,7 +6259,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
},
outputs={'Diff': diff,
'Out': loss},
attrs={'sigma': sigma})
attrs={'sigma': sigma if sigma is not None else 1.0})
return loss
......@@ -7589,6 +7620,8 @@ def sequence_scatter(input, index, updates, name=None):
output = fluid.layers.sequence_scatter(input, index, updates)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_scatter', **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
......@@ -8677,6 +8710,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
......@@ -8716,6 +8751,8 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
Variable: The output sequence mask.
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper('sequence_mask', **locals())
if name is None:
......@@ -9766,6 +9803,8 @@ def sequence_reverse(x, name=None):
Returns:
out(${y_type}): ${y_comment}
"""
assert not _in_dygraph_mode(), (
"sequence layer is not supported in dygraph mode yet.")
helper = LayerHelper("sequence_reverse", **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......@@ -10923,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
return l2loss + celoss
def pixel_shuffle(x, upscale_factor):
"""
**Pixel Shuffle Layer**
This layer rearranges elements in a tensor of shape [N, C, H, W]
to a tensor of shape [N, C/r**2, H*r, W*r].
This is useful for implementing efficient sub-pixel convolution
with a stride of 1/r.
Please refer to the paper `Real-Time Single Image and Video Super-Resolution
Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et al. (2016) for more details.
.. code-block:: text
Given a 4-D tensor with the shape:
x.shape = [1, 9, 4, 4]
Given upscale_factor:
upscale_factor = 3
output shape is:
[1, 1, 12, 12]
Args:
x(Variable): The input tensor variable.
upscale_factor(int): factor to increase spatial resolution
Returns:
Out(Variable): the pixel shuffle result, a tensor variable with shape [N, C/r**2, H*r, W*r] and the same type as the input.
Raises:
ValueError: If the square of upscale_factor cannot divide the channels of input.
Examples:
.. code-block:: python
input = fluid.layers.data(name="input", shape=[9, 4, 4])
output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3)
"""
helper = LayerHelper("pixel_shuffle", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
if not isinstance(upscale_factor, int):
raise TypeError("upscale factor must be int type")
helper.append_op(
type="pixel_shuffle",
inputs={"X": x},
outputs={"Out": out},
attrs={"upscale_factor": upscale_factor})
return out
def fsp_matrix(x, y):
"""
......
......@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
force_cpu=True)
for param_var, grad_var in param_and_grads:
var_numel = reduce(lambda x, y: x * y, param_var.shape)
var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
if var_numel < 16384 or \
param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
......
......@@ -104,6 +104,7 @@ class ParallelExecutor(object):
self._scope = scope if scope is not None else executor.global_scope()
if main_program is not None and main_program._enable_dgc:
assert num_trainers > 1
assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
assert num_trainers * len(
self._places) > 1, "dgc is not useful for single card training"
......@@ -123,6 +124,11 @@ class ParallelExecutor(object):
exec_strategy=exec_strategy,
share_vars_from=share_vars_from._compiled_program
if share_vars_from else None)
# FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
if main_program._enable_dgc:
self._compiled_program._build_strategy.is_distribution = True
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._exe = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope)
......
......@@ -118,6 +118,7 @@ endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 740)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
if(NOT WIN32)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import numpy as np
def simple_fc_net(use_feed=None):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed=None):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def init_data(batch_size=32, img_shape=[784], label_range=9):
np.random.seed(5)
assert isinstance(img_shape, list)
input_shape = [batch_size] + img_shape
img = np.random.random(size=input_shape).astype(np.float32)
label = np.array(
[np.random.randint(0, label_range) for _ in range(batch_size)]).reshape(
(-1, 1)).astype("int64")
return img, label
......@@ -29,7 +29,6 @@ class TestDataset(unittest.TestCase):
def test_dataset_create(self):
""" Testcase for dataset create. """
return
try:
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
except:
......@@ -48,7 +47,6 @@ class TestDataset(unittest.TestCase):
def test_dataset_config(self):
""" Testcase for dataset configuration. """
return
dataset = fluid.core.Dataset("MultiSlotDataset")
dataset.set_thread_num(12)
dataset.set_filelist(["a.txt", "b.txt", "c.txt"])
......@@ -75,7 +73,6 @@ class TestDataset(unittest.TestCase):
"""
Testcase for InMemoryDataset from create to run.
"""
return
with open("test_in_memory_dataset_run_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
......@@ -112,9 +109,10 @@ class TestDataset(unittest.TestCase):
for i in range(2):
try:
exe.train_from_dataset(fluid.default_main_program(), dataset)
except:
#self.assertTrue(False)
except ImportError as e:
pass
except Exception as e:
self.assertTrue(False)
os.remove("./test_in_memory_dataset_run_a.txt")
os.remove("./test_in_memory_dataset_run_b.txt")
......@@ -123,7 +121,6 @@ class TestDataset(unittest.TestCase):
"""
Testcase for QueueDataset from create to run.
"""
return
with open("test_queue_dataset_run_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
......@@ -156,15 +153,14 @@ class TestDataset(unittest.TestCase):
for i in range(2):
try:
exe.train_from_dataset(fluid.default_main_program(), dataset)
except:
#self.assertTrue(False)
except ImportError as e:
pass
except Exception as e:
self.assertTrue(False)
os.remove("./test_queue_dataset_run_a.txt")
os.remove("./test_queue_dataset_run_b.txt")
if __name__ == '__main__':
#unittest.main()
import sys
sys.exit(0)
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
from test_conditional_block import *
if __name__ == '__main__':
unittest.main()
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -22,45 +22,6 @@ import unittest
import os
def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
......@@ -75,10 +36,10 @@ class TestMNIST(TestParallelExecutorBase):
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True):
def _compare_fuse_all_reduce_ops(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random_data)
img, label = init_data()
def _optimizer(learning_rate=1e-6):
optimizer = fluid.optimizer.SGD(
......
......@@ -12,108 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self, random=True):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_fuse_elewise_add_act_ops(self,
model,
use_cuda,
random_data=True):
def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random_data)
img, label = init_data()
def _optimizer(learning_rate=1e-6):
optimizer = fluid.optimizer.SGD(
......
......@@ -11,78 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestFuseAdamOps(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def _init_data(self, random=True):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_fused_optimizer_ops(self,
model,
use_cuda,
random_data=True,
optimizer=fluid.optimizer.Adam):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random_data)
img, label = init_data()
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
......@@ -111,7 +59,7 @@ class TestFuseAdamOps(TestParallelExecutorBase):
def test_batchnorm_fc_with_fuse_op(self):
self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
# self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
class TestFuseSGDOps(TestFuseAdamOps):
......
......@@ -56,7 +56,7 @@ def optimizer_setting(params):
#bd = [step * e for e in ls["epochs"]]
#base_lr = params["lr"]
#lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.1)
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
return optimizer
......@@ -109,7 +109,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
size=num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
act='relu')
act='sigmoid')
def forward(self, input):
y = self._pool(input)
......@@ -316,6 +316,7 @@ class TestImperativeResneXt(unittest.TestCase):
batch_size = train_parameters["batch_size"]
batch_num = 2
epoch_num = 1
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
......@@ -327,52 +328,54 @@ class TestImperativeResneXt(unittest.TestCase):
random.seed = seed
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
batch_size=batch_size,
drop_last=True)
dy_param_init_value = {}
for param in se_resnext.parameters():
dy_param_init_value[param.name] = param.numpy()
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label.stop_gradient = True
out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy()
if batch_id == 0:
for epoch_id in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num and batch_num != -1:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label.stop_gradient = True
out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy()
if batch_id == 0:
for param in se_resnext.parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
#dy_grad_value = {}
#for param in se_resnext.parameters():
# if param.trainable:
# np_array = np.array(param._ivar._grad_ivar().value()
# .get_tensor())
# dy_grad_value[param.name + core.grad_var_suffix()] = np_array
optimizer.minimize(avg_loss)
se_resnext.clear_gradients()
dy_param_value = {}
for param in se_resnext.parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward()
dy_grad_value = {}
for param in se_resnext.parameters():
if param.trainable:
np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor())
dy_grad_value[param.name + core.grad_var_suffix(
)] = np_array
optimizer.minimize(avg_loss)
se_resnext.clear_gradients()
dy_param_value = {}
for param in se_resnext.parameters():
dy_param_value[param.name] = param.numpy()
dy_param_value[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
......@@ -389,7 +392,8 @@ class TestImperativeResneXt(unittest.TestCase):
random.seed = seed
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
batch_size=batch_size,
drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[3, 224, 224], dtype='float32')
......@@ -415,37 +419,42 @@ class TestImperativeResneXt(unittest.TestCase):
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
static_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[batch_size, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
fetch_list.extend(static_grad_name_list)
out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_grad_value = {}
static_out = out[0]
param_start_pos = 1
grad_start_pos = len(static_param_name_list) + param_start_pos
for i in range(param_start_pos,
len(static_param_name_list) + param_start_pos):
static_param_value[static_param_name_list[
i - param_start_pos]] = out[i]
for i in range(grad_start_pos,
len(static_grad_name_list) + grad_start_pos):
static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i]
for epoch_id in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num and batch_num != -1:
break
static_x_data = np.array(
[x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(
[batch_size, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
fetch_list.extend(static_grad_name_list)
out = exe.run(
fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_grad_value = {}
static_out = out[0]
param_start_pos = 1
grad_start_pos = len(
static_param_name_list) + param_start_pos
for i in range(
param_start_pos,
len(static_param_name_list) + param_start_pos):
static_param_value[static_param_name_list[
i - param_start_pos]] = out[i]
for i in range(grad_start_pos,
len(static_grad_name_list) + grad_start_pos):
static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
......@@ -454,12 +463,12 @@ class TestImperativeResneXt(unittest.TestCase):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
# FIXME(Yancey1989): np.array(_ivar.value().get_tensor()) leads to a memory leak
#self.assertEqual(len(dy_grad_value), len(static_grad_value))
#for key, value in six.iteritems(static_grad_value):
# self.assertTrue(np.allclose(value, dy_grad_value[key]))
# self.assertTrue(np.isfinite(value.all()))
# self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value):
......
......@@ -304,7 +304,7 @@ use_py_reader = False
sync = False
# how many batches we use
batch_num = 50
batch_num = 5
np.random.seed = 1
src_word_np = np.random.randint(
......
......@@ -21,25 +21,8 @@ import os
os.environ['FLAGS_enable_parallel_graph'] = str(1)
import paddle.fluid.core as core
import os
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
from simple_nets import simple_fc_net, init_data
class TestMNIST(TestParallelExecutorBase):
......@@ -47,19 +30,12 @@ class TestMNIST(TestParallelExecutorBase):
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def _init_data(self):
np.random.seed(5)
img = np.random.random(size=[32, 784]).astype(np.float32)
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data()
img, label = init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
......@@ -75,8 +51,7 @@ class TestMNIST(TestParallelExecutorBase):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data()
img, label = init_data()
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1,
......
......@@ -14,18 +14,21 @@
from __future__ import print_function
import os
os.environ['FLAGS_fuse_parameter_memory_size'] = "131072"
os.environ['FLAGS_fuse_parameter_groups_size'] = "3"
import paddle.fluid as fluid
fluid.core._set_fuse_parameter_group_size(3)
fluid.core._set_fuse_parameter_memory_size(131072)
import paddle.fluid.layers.ops as ops
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
import paddle.fluid.core as core
from parallel_executor_test_base import TestParallelExecutorBase
from simple_nets import init_data
import unittest
import math
import numpy as np
from functools import partial
# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
# and Executor is different. Because, for ParallelExecutor, the dropout_op of
......@@ -187,17 +190,6 @@ class TestResnet(TestParallelExecutorBase):
remove_dropout = False
remove_bn = False
def _init_data(self, batch_size=2, random=True):
np.random.seed(5)
if random:
img = np.random.random(
size=[batch_size] + img_shape).astype(np.float32)
else:
img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
label = [np.random.randint(0, 999) for _ in range(batch_size)]
label = np.array(label).astype(np.int64).reshape(-1, 1)
return img, label
def _compare_reduce_and_allreduce(self,
model,
use_cuda,
......@@ -209,7 +201,8 @@ class TestResnet(TestParallelExecutorBase):
global remove_bn
remove_bn = True
img, label = self._init_data(batch_size=batch_size)
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=999)
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
......@@ -276,10 +269,12 @@ class TestResnet(TestParallelExecutorBase):
def _check_resnet_convergence(self,
model,
use_cuda=True,
use_reduce=False,
check_func_1,
check_func_2,
use_cuda,
iter=20,
delta2=1e-5):
delta2=1e-5,
compare_seperately=True):
if use_cuda and not core.is_compiled_with_cuda():
return
......@@ -288,31 +283,33 @@ class TestResnet(TestParallelExecutorBase):
remove_dropout = True
remove_bn = True
img, label = self._init_data(batch_size=batch_size)
single_first_loss, single_last_loss = self.check_network_convergence(
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=999)
func_1_first_loss, func_1_last_loss = check_func_1(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=use_reduce,
optimizer=optimizer,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
use_cuda=use_cuda)
func_2_first_loss, func_2_last_loss = check_func_2(
model,
feed_dict={"image": img,
"label": label},
iter=iter,
batch_size=batch_size,
use_cuda=use_cuda,
use_reduce=use_reduce,
optimizer=optimizer)
use_cuda=use_cuda)
self.assertAlmostEquals(
np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5)
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
if compare_seperately:
for loss in zip(func_1_first_loss, func_2_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
for loss in zip(func_1_last_loss, func_2_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
else:
self.assertAlmostEquals(
np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
self.assertAlmostEquals(
np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
def _compare_with_fused_all_reduce(self,
model,
......@@ -325,7 +322,8 @@ class TestResnet(TestParallelExecutorBase):
global remove_bn
remove_bn = True
img, label = self._init_data(batch_size=batch_size)
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=999)
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
......@@ -350,11 +348,6 @@ class TestResnet(TestParallelExecutorBase):
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
def test_seresnext_with_learning_rate_decay(self):
self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
self._check_resnet_convergence(
model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
def test_seresnext_with_reduce(self):
self._compare_reduce_and_allreduce(
model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
......@@ -367,6 +360,50 @@ class TestResnet(TestParallelExecutorBase):
self._compare_with_fused_all_reduce(
model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
def test_seresnext_with_learning_rate_decay(self):
check_func_1 = partial(
self.check_network_convergence,
optimizer=optimizer,
use_parallel_executor=True)
check_func_2 = partial(
self.check_network_convergence,
optimizer=optimizer,
use_parallel_executor=False)
self._check_resnet_convergence(
SE_ResNeXt50Small,
check_func_1,
check_func_2,
use_cuda=True,
compare_seperately=False)
self._check_resnet_convergence(
SE_ResNeXt50Small,
check_func_1,
check_func_2,
use_cuda=False,
compare_seperately=False,
iter=2,
delta2=1e-3)
def test_seresnext_with_fused_optimizer_ops(self):
check_func_1 = partial(
self.check_network_convergence, fuse_all_optimizer_ops=False)
check_func_2 = partial(
self.check_network_convergence, fuse_all_optimizer_ops=True)
# TODO(zcd): this test fails randomly, I will fix it in the next PR.
# self._check_resnet_convergence(
# SE_ResNeXt50Small,
# check_func_1,
# check_func_2,
# use_cuda=True,
# delta2=1e-3)
self._check_resnet_convergence(
SE_ResNeXt50Small,
check_func_1,
check_func_2,
use_cuda=False,
iter=2,
delta2=1e-3)
if __name__ == '__main__':
unittest.main()
......@@ -13,7 +13,7 @@
# limitations under the License.
from __future__ import print_function
from simple_nets import simple_fc_net
import paddle.fluid as fluid
from paddle.fluid import compiler
import paddle.fluid.core as core
......@@ -24,23 +24,6 @@ import sys
import math
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
from simple_nets import simple_fc_net
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler
......@@ -24,23 +25,6 @@ import sys
import math
def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestPassBuilder(unittest.TestCase):
def check_network_convergence(self, use_cuda, build_strategy=None):
os.environ['CPU_NUM'] = str(4)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPixelShuffle(OpTest):
def setUp(self):
self.op_type = "pixel_shuffle"
n, c, h, w = 2, 9, 4, 4
up_factor = 3
shape = [n, c, h, w]
x = np.random.random(shape).astype("float32")
new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
w)
# reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
npresult = np.reshape(x, new_shape)
# transpose to (num,output_channel,h,upscale_factor,w,upscale_factor)
npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
npresult = np.reshape(npresult, oshape)
self.inputs = {'X': x}
self.outputs = {'Out': npresult}
self.attrs = {'upscale_factor': up_factor}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
......@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .trainer_desc import MultiTrainer, DistMultiTrainer
from .device_worker import Hogwild, DownpourSGD
__all__ = ["TrainerFactory"]
......@@ -20,8 +23,6 @@ class TrainerFactory(object):
pass
def _create_trainer(self, opt_info=None):
from .trainer_desc import MultiTrainer, DistMultiTrainer
from .device_worker import Hogwild, DownpourSGD
trainer = None
device_worker = None
if opt_info is None:
......
......@@ -157,10 +157,6 @@ package_data['paddle.libs']= []
package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_WBAES}' == 'ON':
package_data['paddle.libs'] += ['libwbaes' + ext_name]
shutil.copy('${WBAES_SHARED_LIB}', libs_path)
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_LIB}', libs_path)
shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
......