Commit d0c9f7e8 authored by S smallv0221

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into yxp0925

Merge branch 'develop'.
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
   set(proto_hdrs)
   paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()
 function(py_proto_compile TARGET_NAME)
......
@@ -39,6 +39,7 @@ set(third_party_deps)
 #    REPOSITORY   ${TARGET_REPOSITORY}
 #    TAG          ${TARGET_TAG}
 #    DIR          ${TARGET_SOURCE_DIR})
 FUNCTION(cache_third_party TARGET)
     SET(options "")
     SET(oneValueArgs URL REPOSITORY TAG DIR)
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
+if(NOT WIN32 AND NOT APPLE)
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)
+endif()
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
-    if(WITH_GLOO)
-        include(external/gloo)
-        list(APPEND third_party_deps extern_gloo)
-    endif()
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
......
@@ -74,7 +74,9 @@ class PullDenseWorker {
   virtual void Initialize(const TrainerDesc& param);
 #ifdef PADDLE_WITH_CUDA
   void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) {
     places_.push_back(place);
   }
@@ -135,9 +137,9 @@ class PullDenseWorker {
 #ifdef PADDLE_WITH_CUDA
   std::vector<cudaStream_t> copy_streams_;
+#endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
-#endif
 };
 // should incorporate different type of device
@@ -161,6 +163,7 @@ class DeviceWorker {
   virtual void SetDataFeed(DataFeed* data_feed);
   virtual void SetWorkerNum(int num) {}
   virtual void CacheProgram(const ProgramDesc& main_program) {}
+  virtual void GetXpuOpIndex() {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
......
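The new GetXpuOpIndex() virtual added above defaults to a no-op, so existing DeviceWorker subclasses keep compiling while XPU-aware workers can override it. A self-contained sketch of that base-hook pattern (class names here are illustrative, not from this commit):

#include <iostream>

// Sketch of the no-op virtual hook pattern used by DeviceWorker above:
// the base class supplies an empty default, so only device-specific
// subclasses need to opt in.
class WorkerBase {
 public:
  virtual ~WorkerBase() = default;
  virtual void GetXpuOpIndex() {}  // default: do nothing
};

class XpuAwareWorker : public WorkerBase {
 public:
  void GetXpuOpIndex() override {
    std::cout << "collecting XPU op indices\n";  // stand-in for real logic
  }
};

int main() {
  XpuAwareWorker worker;
  WorkerBase* base = &worker;
  base->GetXpuOpIndex();  // virtual dispatch reaches the XPU override
}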
@@ -127,6 +127,7 @@ message DistributedStrategy {
   optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
   optional bool adaptive_localsgd = 24 [ default = false ];
+  optional bool fp16_allreduce = 25 [ default = false ];
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
......
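Since field 25 is optional with an explicit default, readers see false until a caller opts in. A hedged sketch of the generated C++ accessors (the include path and namespace are assumptions based on where the proto lives, not confirmed by this diff):

#include "paddle/fluid/framework/distributed_strategy.pb.h"  // assumed path

// Sketch: toggling the new flag through the protobuf-generated API.
// The namespace paddle::fleet is an assumption from the proto package.
void EnableFp16AllReduce(paddle::fleet::DistributedStrategy* strategy) {
  // Reading before any set returns the declared default (false).
  if (!strategy->fp16_allreduce()) {
    strategy->set_fp16_allreduce(true);  // opt in to FP16 gradient allreduce
  }
}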
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
     push_sparse_status->push_back(std::move(status));
   }
 }
+#endif
+#ifdef PADDLE_WITH_XPU
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size,
+    const paddle::platform::Place& place) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+    Variable* pin_var = scope.FindVar(t + "pin");
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g =
+        pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+    memory::Copy(platform::CPUPlace(), pin_g,
+                 BOOST_GET_CONST(platform::XPUPlace, place), g_data,
+                 sizeof(float) * count);
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+#endif
+}
 #endif
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
......
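The XPU overload mirrors the CUDA one but drops the stream and event parameters: the gradient is staged through the host-side "<var>pin" tensor with a synchronous memory::Copy, so there is nothing stream-ordered to track. A self-contained sketch of that overload split, with framework types reduced to stand-ins:

#include <cstdio>

// Self-contained sketch of the split above: the CUDA variant carries
// stream/event parameters, the XPU variant does not. Types are stand-ins
// for the framework's stream and event types.
struct Stream {};
struct Event {};

void PushDenseVarsAsync(int table_id, Stream, Event) {
  std::printf("CUDA path: async push on table %d\n", table_id);
}
void PushDenseVarsAsync(int table_id) {
  std::printf("XPU path: host-staged push on table %d\n", table_id);
}

int main() {
#ifdef PADDLE_WITH_CUDA
  PushDenseVarsAsync(1, Stream{}, Event{});
#else
  PushDenseVarsAsync(1);  // XPU/CPU builds take the stream-free overload
#endif
}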
@@ -160,6 +160,14 @@ class FleetWrapper {
                           float scale_datanorm, int batch_size,
                           const paddle::platform::Place& place, cudaStream_t stream,
                           cudaEvent_t event);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place);
 #endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
......
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   if (platform::is_cpu_place(tensor->place())) {
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
-#ifdef PADDLE_WITH_CUDA
   } else {
+#ifdef PADDLE_WITH_CUDA
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
                  tensor->numel() * SizeOfType(tensor->type()), nullptr);
-  }
-#else
-  }
 #endif
+#ifdef PADDLE_WITH_XPU
+    memory::Copy(platform::CPUPlace(), data_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 tensor->data<void>(),
+                 tensor->numel() * SizeOfType(tensor->type()));
+#endif
+  }
 }
-// void HeterWrapper::DeSerializeToTensor(Scope* scope,
-// const HeterRequest* request) {
 #ifdef PADDLE_WITH_CUDA
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place,
                                        cudaStream_t stream) {
+  // const VariableMessage& req_var = request->vars();
+  auto* var = scope->FindVar(req_var.varname());
+  auto* tensor = var->GetMutable<LoDTensor>();
+  std::vector<int> vec_dim;
+  for (auto& x : req_var.dims()) {
+    vec_dim.push_back(x);
+  }
+  tensor->Resize(make_ddim(vec_dim));
+  LoD lod;
+  for (int i = 0; i < req_var.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
+      v.push_back(req_var.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+  void* tensor_data =
+      tensor->mutable_data(place, ToVarType(req_var.data_type()));
+#ifdef PADDLE_WITH_CUDA
+  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+               platform::CPUPlace(), req_var.data().data(),
+               tensor->numel() * SizeOfType(tensor->type()), stream);
 #else
+  memcpy(tensor_data, req_var.data().data(),
+         tensor->numel() * SizeOfType(tensor->type()));
+#endif
+}
+#endif
+// void HeterWrapper::DeSerializeToTensor(Scope* scope,
+// const HeterRequest* request) {
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place) {
-#endif
   // const VariableMessage& req_var = request->vars();
   auto* var = scope->FindVar(req_var.varname());
   auto* tensor = var->GetMutable<LoDTensor>();
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   void* tensor_data =
       tensor->mutable_data(place, ToVarType(req_var.data_type()));
-#ifdef PADDLE_WITH_CUDA
-  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+#ifdef PADDLE_WITH_XPU
+  memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
                platform::CPUPlace(), req_var.data().data(),
-               tensor->numel() * SizeOfType(tensor->type()), stream);
+               tensor->numel() * SizeOfType(tensor->type()));
 #else
   memcpy(tensor_data, req_var.data().data(),
          tensor->numel() * SizeOfType(tensor->type()));
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
     case VariableMessage::BOOL:
       return framework::proto::VarType::BOOL;  // NOLINT
     default:
-      VLOG(0) << "Not support type " << type;
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "ToVarType:Unsupported type %d", type));
   }
 }
......
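The default-case change above is more than cosmetic: with only a VLOG(0), ToVarType could fall off the end of a non-void function on an unknown enum value, which is undefined behavior; throwing makes the failure loud and keeps every path either returning or raising. A standalone sketch of the fixed pattern:

#include <stdexcept>
#include <string>

enum class WireType { FP32, INT64, BOOL, UNKNOWN };
enum class VarType { FP32, INT64, BOOL };

// Every case returns and the default throws, so control can never fall
// off the end of this non-void function.
VarType ToVarType(WireType type) {
  switch (type) {
    case WireType::FP32:
      return VarType::FP32;
    case WireType::INT64:
      return VarType::INT64;
    case WireType::BOOL:
      return VarType::BOOL;
    default:
      throw std::invalid_argument("ToVarType: unsupported type " +
                                  std::to_string(static_cast<int>(type)));
  }
}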
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <vector>
+#include "io/fs.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/trainer.h"
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_num = trainer_desc.worker_places_size();
   for (int i = 0; i < place_num; ++i) {
     int num = trainer_desc.worker_places(i);
+#ifdef PADDLE_WITH_CUDA
     platform::CUDAPlace place = platform::CUDAPlace(num);
     platform::CUDADeviceGuard guard(place.device);
     cudaStream_t stream;
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     events_.push_back(event);
+#endif
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace place = platform::XPUPlace(num);
+    places_.push_back(place);
+#endif
   }
   // thread_num_ = trainer_desc.thread_num();
   // SetDataset(dataset);
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
 void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
   auto place = places_[num];
   Scope* scope = place_scopes_[num];
+#ifdef PADDLE_WITH_CUDA
   auto stream = copy_streams_[num];
   auto event = events_[num];
   auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
   platform::CUDADeviceGuard guard(dev_id);
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+#endif
   auto& block = program.Block(0);
   for (auto& var : block.AllVars()) {
     if (var->Persistable()) {
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
       HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
     }                                                                   \
   } while (0)
+#define HeterMemcpyXpuFunc(cpp_type, proto_type)                  \
+  do {                                                            \
+    if (root_tensor->type() == proto_type) {                      \
+      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place);   \
+    }                                                             \
+  } while (0)
+#ifdef PADDLE_WITH_CUDA
       _ForEachDataType_(HeterMemcpyFunc);
+#endif
+#ifdef PADDLE_WITH_XPU
+      _ForEachDataType_(HeterMemcpyXpuFunc);
+#endif
     }
   }
+#ifdef PADDLE_WITH_CUDA
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
   cudaEventSynchronize(event);
+#endif
 }
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                   LoDTensor* root_tensor,
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                  root_ptr, sizeof(T) * root_tensor->numel(), stream);
   }
 }
+#endif
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
+                                  LoDTensor* root_tensor,
+                                  const paddle::platform::Place& thread_place) {
+  T* thread_ptr =
+      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
+  T* root_ptr = root_tensor->data<T>();
+  if (platform::is_cpu_place(root_tensor->place())) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 platform::CPUPlace(), root_ptr,
+                 sizeof(T) * root_tensor->numel());
+  } else {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
+                 root_ptr, sizeof(T) * root_tensor->numel());
+  }
+}
+#endif
 void HeterXpuTrainer::DumpWork(int tid) {}
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
     CreateThreadParam(main_program, i);
     pull_dense_worker_->AddThreadScope(scope);
     pull_dense_worker_->AddPlace(places_[i]);
+#ifdef PADDLE_WITH_CUDA
     pull_dense_worker_->AddStream(copy_streams_[i]);
+#endif
   }
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_CUDA
   for (auto& stream : copy_streams_) {
     cudaStreamSynchronize(stream);
   }
+#endif
   op_names_.clear();
   for (auto& op_desc : block.AllOps()) {
     std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
       OperatorBase* local_op_ptr = local_op.release();
       (context->ops_).push_back(local_op_ptr);
     }
+#ifdef PADDLE_WITH_CUDA
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::CUDADeviceGuard guard(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
     object_pool_.Push(context);
   }
 }
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
       }                                                                 \
     } while (0)
     _ForEachDataType_(MergeCallback);
-    if (platform::is_gpu_place(thread_tensor->place())) {
+    if (!platform::is_cpu_place(thread_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
       auto dev_id =
           BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
       platform::CUDADeviceGuard guard(dev_id);
      cudaMemset(thread_tensor->data<void>(), 0,
                 thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = thread_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
+                  thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
     } else {
       memset(thread_tensor->data<void>(), 0,
              thread_tensor->numel() * SizeOfType(thread_tensor->type()));
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
     auto* merge_var = response->add_vars();
     heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
                                merge_var);
-    if (platform::is_gpu_place(root_tensor->place())) {
+    if (!platform::is_cpu_place(root_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
      auto dev_id =
          BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
      platform::CUDADeviceGuard guard(dev_id);
      cudaMemset(root_tensor->data<void>(), 0,
                 root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = root_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
+                  root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
     } else {
      memset(root_tensor->data<void>(), 0,
             root_tensor->numel() * SizeOfType(root_tensor->type()));
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       OperatorBase* local_op_ptr = local_op.release();
       (context->ops_).push_back(local_op_ptr);
     }
+#ifdef PADDLE_WITH_CUDA
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::CUDADeviceGuard guard(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
   }
   context->Reset();
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
     auto deserial_timer =
         std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
     for (int i = 0; i < request->vars_size(); ++i) {
+#ifdef PADDLE_WITH_CUDA
       heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
                                       copy_streams_[context->place_num_]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
+#endif
     }
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
     while (cudaEventQuery(context->event_) != cudaSuccess) {
       VLOG(3) << "wait for kernel";
       bthread_yield();
     }
+#endif
   }
   {
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       op->Run(*(context->scope_), place);
     }
   }
+#ifdef PADDLE_WITH_CUDA
   auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
       platform::DeviceContextPool::Instance().Get(place));
   PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       bthread_yield();
     }
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_wait();
+#endif
   for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
     const std::string& varname = trainer_desc_.xpu_send_list(i);
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
        ++i) {
     uint64_t tid =
         static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+#ifdef PADDLE_WITH_CUDA
     fleet_ptr_->PushDenseVarsAsync(
         *(context->scope_), tid, dense_grad_names_[tid],
         &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
         places_[context->place_num_], copy_streams_[context->place_num_],
         context->event_);
+#endif
+#ifdef PADDLE_WITH_XPU
+    fleet_ptr_->PushDenseVarsAsync(
+        *(context->scope_), tid, dense_grad_names_[tid],
+        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
+        places_[context->place_num_]);
+#endif
   }
   for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
        ++i) {
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
   pull_dense_worker_->Stop();
   root_scope_->DropKids();
 }
 }  // namespace framework
 }  // namespace paddle
 #endif
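Two recurring moves in this file are worth noting: the guard is_gpu_place(...) becomes !is_cpu_place(...) so one branch serves both device families, and the device-specific body inside that branch is then chosen by preprocessor guards. A self-contained sketch of that dispatch shape (the device calls are left as comments since they need the CUDA/XPU runtimes):

#include <cstring>

enum class Place { kCPU, kDevice };  // stand-in for platform::Place

// Sketch of the EndPass zeroing dispatch: a single "not CPU" branch whose
// body is selected at compile time, with plain memset as the host path.
void ZeroBuffer(Place place, void* data, size_t bytes) {
  if (place != Place::kCPU) {
#if defined(PADDLE_WITH_CUDA)
    // cudaMemset(data, 0, bytes);        // GPU build zeroes on device
#elif defined(PADDLE_WITH_XPU)
    // xpu::memset(ctx, data, 0, bytes);  // XPU build needs a device context
#endif
  } else {
    std::memset(data, 0, bytes);
  }
}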
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
 REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvActivationFusePass);
+REGISTER_PASS_CAPABILITY(conv_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu", 0));
 REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DLeakyReLUFusePass);
+REGISTER_PASS_CAPABILITY(conv_leaky_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .LE("leaky_relu", 1));
 REGISTER_PASS(conv_relu6_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DReLU6FusePass);
+REGISTER_PASS_CAPABILITY(conv_relu6_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu6", 0));
 REGISTER_PASS(conv_swish_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DSwishFusePass);
+REGISTER_PASS_CAPABILITY(conv_swish_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("swish", 0));
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvConcatReLUFusePass);
+REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("concat", 0)
+            .EQ("relu", 0));
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
 #include <paddle/fluid/string/pretty_log.h>
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
 REGISTER_PASS(matmul_transpose_reshape_fuse_pass,
               paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass);
+REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("matmul", 0)
+            .EQ("transpose", 0)
+            .EQ("reshape", 0));
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(scale_matmul_fuse_pass,
               paddle::framework::ir::ScaleMatmulFusePass);
+REGISTER_PASS_CAPABILITY(scale_matmul_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("scale", 0)
+            .EQ("matmul", 0));
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(seq_concat_fc_fuse_pass,
               paddle::framework::ir::SeqConcatFcFusePass);
+REGISTER_PASS_CAPABILITY(seq_concat_fc_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("sequence_expand", 0)
+            .EQ("concat", 0)
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("sigmoid", 0)
+            .EQ("tanh", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0)
+            .EQ("fusion_seqexpand_concat_fc", 0));
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
   fleet_ptr_ = FleetWrapper::GetInstance();
 #ifdef PADDLE_WITH_CUDA
   copy_streams_.clear();
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   places_.clear();
   thread_scopes_.clear();
 #endif
 }
 void PullDenseWorker::CreatePinVar() {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
   // for (auto& v : dense_value_names_) {
   //  for (auto& name : v.second) {
   for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
       auto* ptr = root_scope_->Var(name + "pin");
       InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
       LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
+#ifdef PADDLE_WITH_CUDA
       pin_tensor->mutable_data<float>(tensor->dims(),
                                       platform::CUDAPinnedPlace());
+#endif
+#ifdef PADDLE_WITH_XPU
+      pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+#endif
     }
   }
 #endif
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
     exit(-1);
   }
   status_vec->resize(0);
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   for (size_t i = 0; i < places_.size(); ++i) {
     // for (auto& v : dense_value_names_) {
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
       Variable* var = thread_scopes_[i]->FindVar(name);
       LoDTensor* tensor = var->GetMutable<LoDTensor>();
      float* w = tensor->data<float>();
+#ifdef PADDLE_WITH_CUDA
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                    platform::CUDAPinnedPlace(), pin_w,
                    sizeof(float) * tensor->numel(), copy_streams_[i]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
+                   platform::CPUPlace(), pin_w,
+                   sizeof(float) * tensor->numel());
+#endif
     }
   }
 }
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
     uint64_t tid = static_cast<uint64_t>(
         dwp_param_.program_config(0).pull_dense_table_id(i));
     if (force_update || CheckUpdateParam(tid)) {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
       VLOG(3) << "pull dense " << force_update << " " << tid;
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                      &pull_dense_status_, false);
......
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
   HeterServiceContext() {}
@@ -151,7 +152,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
+#ifdef PADDLE_WITH_CUDA
   cudaEvent_t event_;
+#endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
 };
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
   virtual void CacheProgram(const ProgramDesc& main_program) {
     new (&program_) ProgramDesc(main_program);
   }
+  virtual std::string GetDumpPath(int tid) { return ""; }
+  virtual void InitDumpEnv() {}
   template <typename T>
+#ifdef PADDLE_WITH_CUDA
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
                    cudaStream_t stream);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
+                   const paddle::platform::Place& thread_place);
+#endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
   void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<std::string> op_names_;
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
-  std::vector<cudaStream_t> copy_streams_;
   std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
   std::vector<cudaEvent_t> events_;
+#endif
 };
 #endif
......
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 #endif
 #if defined(PADDLE_WITH_NCCL)
......
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
     # seq_pool1
     set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
     download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc)
     if(NOT WIN32)
-        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
+        set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120)
     endif()
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # transformer, the dataset only works on batch_size=8 now
 set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc
+inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
+        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
+        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
         ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
         --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Compare Deterministic result
TEST(Analyzer_seq_pool1_compare_determine, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
TEST(Analyzer_seq_pool1_compare, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig cfg1;
SetConfig(&cfg1);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back(out_var_name);
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Check the fuse status
TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 171);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg, use_mkldnn);
std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_seq_pool1_profile, profile) { profile(); }
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma once
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
+namespace seq_pool1_tester {
 // diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
 static const char out_var_name[] = "reduce_sum_0.tmp_0";
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
     cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
 }
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg, use_mkldnn);
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-TEST(Analyzer_seq_pool1, profile) { profile(); }
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_seq_pool1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-// Compare Deterministic result
-TEST(Analyzer_seq_pool1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
-  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
-  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
-  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
-  LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 171);
-}
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_seq_pool1, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back(out_var_name);
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
+}  // namespace seq_pool1_tester
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
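The helper now carries #pragma once, its own namespace, and only the shared SetConfig/SetInput utilities, while each TEST moves into its own translation unit; several single-test binaries can then include the header without symbol clashes. A minimal sketch of the resulting layout (file names illustrative):

// analyzer_seq_pool1_tester_helper.h (sketch)
#pragma once
#include <string>

namespace seq_pool1_tester {
// static: each including test binary gets its own copy, matching how the
// helper's out_var_name constant is declared above.
static const char out_var_name[] = "reduce_sum_0.tmp_0";

inline std::string OutputVarName() { return out_var_name; }
}  // namespace seq_pool1_tester

// A per-check tester (e.g. analyzer_seq_pool1_profile_tester.cc) then just
// includes this helper and defines a single TEST that calls the shared
// SetConfig/SetInput functions.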
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
// Check the fuse status
TEST(Analyzer_Transformer, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> outputs;
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_Transformer, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
#endif
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -11,11 +11,16 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
struct DataRecord {
  std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
...@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  }
}
}  // namespace transformer_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include <algorithm>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
...@@ -20,6 +21,7 @@
#include <string>
#include <thread>  // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
...@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
  size_t thread_num = 4;
  size_t sleep_time = 40;
  size_t extra_time = 20;
  // Reserve to perform more tests in the future
  std::vector<std::shared_ptr<Allocator>> allocators;
......
...@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
  }
};
// AbsGrad: dx=dy if x >=0 else -dy
// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx
template <typename T>
class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("abs_grad_grad");
// input1: x
op->SetInput("X", this->Input("X"));
// input2: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(this->Attrs());
// output: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
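The maker above only wires inputs and outputs; the underlying math is the chain rule applied to y = |x| twice. A minimal scalar sketch of the mapping it sets up (illustrative only, with a hypothetical helper name, not part of the patch):

// d|x|/dx = sign(x), so the second-order pass forwards the incoming ddx
// with the sign of the original forward input x (the "X" wired above).
template <typename T>
T AbsDoubleGradRef(T x, T ddx) {
  return x >= static_cast<T>(0) ? ddx : -ddx;
}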
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
template <typename T>
class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
 public:
...@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
    std::conditional<ops::CanInplaceAct<ops::AbsGradFunctor<float>>(),
                     ops::ActFwdInplaceInferer, void>::type);
REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer,
ops::AbsDoubleGradMaker<paddle::framework::OpDesc>,
ops::AbsDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
abs_grad_grad,
ops::ActivationOpDoubleGrad<ops::AbsGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(abs,
                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
...@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
                              ops::AbsGradFunctor<int>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::AbsGradFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
......
...@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
        ops::ExpGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ==========================    abs register   ============================ */
REGISTER_OP_CUDA_KERNEL(
    abs, ops::ActivationKernel<plat::CUDADeviceContext, ops::AbsFunctor<float>>,
...@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
                              ops::AbsGradFunctor<int64_t>>,
    ops::ActivationGradKernel<plat::CUDADeviceContext,
                              ops::AbsGradFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
...@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
  }
};
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* Out, const framework::Tensor* ddX,
framework::Tensor* ddOut, framework::Tensor* dOut,
framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
ddout.device(*d) = ddx * x.sign();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
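As a quick sanity check of the functor's rule ddout = ddx * x.sign() (a hand-worked example, not code from the patch): for x = {-2, 0, 3} and ddx = {1, 1, 1}, x.sign() evaluates to {-1, 0, 1}, so ddout comes out as {-1, 0, 1}. Note that sign(0) = 0, so the double grad vanishes at the non-differentiable point rather than following the "x >= 0" branch of the comment above.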
template <typename T>
struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device>
......
...@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
  op->SetInput("SavedMean", this->Input("SavedMean"));
  op->SetInput("SavedVariance", this->Input("SavedVariance"));
  if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
    op->SetInput("Mean", this->Input("Mean"));
    op->SetInput("Variance", this->Input("Variance"));
  }
  op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
...@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
                 "BatchNormDoubleGrad");
  }
  OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
  OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
  // check output
  OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
  const auto x_dims = ctx->GetInputDim("X");
  const DataLayout data_layout = framework::StringToDataLayout(
      ctx->Attrs().Get<std::string>("data_layout"));
  const int C =
      ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
           ? x_dims[1]
           : x_dims[x_dims.size() - 1]);
  if (ctx->HasOutput("DX")) {
    ctx->SetOutputDim("DX", x_dims);
  }
...@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
    Tensor inv_var_tensor;
    if (use_global_stats) {
      const auto *running_mean = ctx.Input<Tensor>("Mean");
      const auto *running_variance = ctx.Input<Tensor>("Variance");
      mean_data = running_mean->data<T>();
      inv_var_tensor.Resize({C});
      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
...@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
    //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
    //          np.sum(dy,
    //          axis=(n,h,w)) * (x - mean) *
    //          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
    //          inv_var
    //          *
    //          np.mean(dy, axis=(n,h,w)) -
    //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
    //          axis=(n,h,w)))
    if (ddX) {
      dx_arr +=
...@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
          C, sample_size);
      ddy_arr.setZero();
      if (use_global_stats) {
        // math: ddy = r * ddx * inv_var + ddbias +
        //             ddscale * (x - mean) * inv_var
        if (ddX) {
          ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
        }
...@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
                        .replicate(1, sample_size) /
                    sample_size);
        }
      }
      if (ddScale) {
        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
        Tensor ddscale_tile;
        ddscale_tile.Resize({C, sample_size});
        EigenArrayMap<T> ddscale_tile_data(
            ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
        ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
      }
      if (ddBias) {
        ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
        Tensor ddbias_tile;
        ddbias_tile.Resize({C, sample_size});
        EigenArrayMap<T> ddbias_tile_data(
            ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
        ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
        ddy_arr += ddbias_tile_data;
      }
    }
    if (data_layout == DataLayout::kNCHW) {
      VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
      TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
......
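In the use_global_stats branch above, y = scale * (x - mean) * inv_var + bias is affine in x, scale, and bias, so ddy is linear in the three second-order inputs. A per-element sketch of the formula from the comment (a hypothetical helper, assuming inv_var = 1/sqrt(var + eps); not part of the patch):

// ddy = r * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var
float BatchNormGlobalStatsDdyRef(float ddx, float ddscale, float ddbias,
                                 float x, float mean, float inv_var, float r) {
  return r * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var;
}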
...@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
    CreateInputVarDesc();
    CreateOutputVarDesc();
  } else {
    PADDLE_THROW(platform::errors::NotFound(
        "Operator '%s' is not registered in OpTester.", config_.op_type));
  }
  if (config_.device_id >= 0) {
...@@ -81,7 +81,8 @@ void OpTester::Run() {
    platform::EnableProfiler(platform::ProfilerState::kAll);
    platform::SetDeviceId(config_.device_id);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "'CUDAPlace' is not supported in CPU only device."));
#endif
  }
...@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
  } else if (str == "fp64") {
    return framework::proto::VarType::FP64;
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported dtype %s in OpTester.", str.c_str()));
  }
}
...@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
      case framework::proto::AttrType::INTS:
      case framework::proto::AttrType::FLOATS:
      case framework::proto::AttrType::STRINGS:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported STRINGS type in OpTester yet."));
        break;
      case framework::proto::AttrType::LONG: {
        int64_t value = StringTo<int64_t>(value_str);
...@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
      } break;
      case framework::proto::AttrType::LONGS:
      default:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported attr type %d in OpTester.", type));
    }
  }
}
...@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
    }
    is.close();
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported initializer %s in OpTester.", initializer.c_str()));
  }
  if (!platform::is_cpu_place(place_)) {
...@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
          static_cast<double>(1.0), item.second.initializer,
          item.second.filename);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported dtype %d in OpTester.", data_type));
    }
    VLOG(3) << "Set lod for tensor " << var_name;
...@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
         << "\n";
    } break;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported attr type %d in OpTester.", attr_type));
  }
  ss << GenSpaces(--count) << "}\n";
}
...@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
TEST(op_tester, base) {
  if (!FLAGS_op_config_list.empty()) {
    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
    PADDLE_ENFORCE_EQ(
        static_cast<bool>(fin), true,
        platform::errors::InvalidArgument("OpTester cannot open file %s.",
                                          FLAGS_op_config_list.c_str()));
    std::vector<OpTesterConfig> op_configs;
    while (!fin.eof()) {
      VLOG(4) << "Reading config " << op_configs.size() << "...";
......
...@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
  } else if (dtype_str == "fp64" || dtype_str == "double") {
    dtype = "fp64";
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str()));
  }
  VLOG(4) << "dtype of input " << name << " is: " << dtype;
}
...@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
  const std::vector<std::string> supported_initializers = {"random", "natural",
                                                           "zeros", "file"};
  if (!Has(supported_initializers, initializer_str)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported initializer %s in OpInputConfig.",
        initializer_str.c_str()));
  }
  initializer = initializer_str;
...@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
    }
  }
  EraseEndSep(&lod_str);
  PADDLE_ENFORCE_GE(
      lod_str.length(), 4U,
      platform::errors::InvalidArgument(
          "The length of lod string should be "
          "equal to or larger than 4. But the length of lod string is %zu.",
          lod_str.length()));
  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
  // Parse the lod_str
...@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
OpTesterConfig::OpTesterConfig(const std::string& filename) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE_EQ(
      static_cast<bool>(fin), true,
      platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.",
                                        filename.c_str()));
  Init(fin);
}
......
...@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor) Dependency of the variable need to sync")
        .AsDuplicable();
    AddOutput("Out", "(Tensor) Dependency of the variable need to sync")
        .AsDuplicable();
    AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
    AddComment(R"DOC(
CSyncCommStream Operator
......
...@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
    //          (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
    //          np.sum(dy,
    //          axis=(h,w)) * (x - mean) *
    //          (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var -
    //          inv_var *
    //          np.mean(dy, axis=(h,w)) -
    //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
    //          axis=(h,w)))
    Tensor x_sub_mean_mul_invstd;
    x_sub_mean_mul_invstd.Resize({sample_size, NxC});
......
...@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
}
using Tensor = paddle::framework::Tensor;
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYZN() {
  using T = typename KernelTuple::data_type;
...@@ -320,8 +319,15 @@ void BenchKernelSgd() {
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
    PADDLE_ENFORCE_LE(
        static_cast<size_t>(upper - lower), n - 1,
        paddle::platform::errors::InvalidArgument(
            "The range of Sgd (upper - lower) should be equal to or lower "
            "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
            static_cast<size_t>(upper - lower), (n - 1)));
    PADDLE_ENFORCE_GT(
        n, 0, paddle::platform::errors::InvalidArgument(
                  "The Sgd size should be larger than 0. But the n is %d.", n));
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
......
...@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const emb_seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.table_height, 0,
                      platform::errors::InvalidArgument(
                          "The attribute table_height of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.table_height));
    PADDLE_ENFORCE_GT(attr.table_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute table_width of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.table_width));
    PADDLE_ENFORCE_GT(attr.index_height, 0,
                      platform::errors::InvalidArgument(
                          "The attribute index_height of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.index_height));
    PADDLE_ENFORCE_GT(attr.index_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute index_width of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.index_width));
    PADDLE_ENFORCE_GT(attr.out_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute out_width of EmbSeqPool should be "
                          "larger than 0. But it is %d.",
                          attr.out_width));
    return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
  }
};
......
...@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
  preCode();
  int block, rest;
  const auto groups = packed_groups(n_, k_, &block, &rest);
  PADDLE_ENFORCE_GT(
      groups.front(), 0,
      platform::errors::InvalidArgument("The number of rest registers should "
                                        "be larger than 0. But it is %d.",
                                        groups.front()));
  const int block_len = sizeof(float) * block;
  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
...@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const matmul_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(
        attr.m, 0, platform::errors::InvalidArgument(
                       "The attribute m (first matrix's rows) of MatMul "
                       "should be larger than 0. But it is %d.",
                       attr.m));
    PADDLE_ENFORCE_GT(
        attr.n, 0, platform::errors::InvalidArgument(
                       "The attribute n (second matrix's columns) of MatMul "
                       "should be larger than 0. But it is %d.",
                       attr.n));
    PADDLE_ENFORCE_GT(
        attr.k, 0, platform::errors::InvalidArgument(
                       "The attribute k (first matrix's columns, equal to "
                       "the second matrix's rows) of MatMul should be larger "
                       "than 0. But it is %d.",
                       attr.k));
    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
  }
};
......
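The m/n/k checks above follow the usual GEMM convention; a naive reference makes the roles of the three extents explicit (an orientation sketch with a hypothetical helper name, not the generated jit kernel):

// C (m x n) = A (m x k) * B (k x n); each extent must be positive.
void NaiveMatMulRef(const float* A, const float* B, float* C, int m, int n,
                    int k) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * n + j];
      C[i * n + j] = acc;
    }
}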
...@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
              size_t code_size = 256 * 1024,
              void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
    PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented(
                                 "Jitcode of matmul only supports m==1 (first "
                                 "matrix's rows) now. But m is %d.",
                                 m_));
    this->genCode();
  }
......
...@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument(
                                     "The attribute width of SeqPool should "
                                     "be larger than 0. But it is %d.",
                                     attr.w));
    PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument(
                                     "The attribute height of SeqPool should "
                                     "be larger than 0. But it is %d.",
                                     attr.h));
    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
  }
};
......
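The w and h validated above are the row width and the number of rows pooled into a single output row; a minimal sum-pooling reference (an assumption-level sketch, not the vectorized jitcode):

// y[j] = sum over i of x[i * w + j]; the avg/sqrt variants rescale this sum.
void SeqPoolSumRef(const float* x, float* y, int h, int w) {
  for (int j = 0; j < w; ++j) y[j] = 0.f;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[j] += x[i * w + j];
}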
...@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
      vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
      reg_idx++;
    }
    PADDLE_ENFORCE_EQ(
        reg_idx, rest_used_num_regs,
        platform::errors::InvalidArgument(
            "All heights of SeqPool should use the same number of registers, "
            "which equals the number of rest registers. But %d registers are "
            "used and the number of rest registers is %d.",
            reg_idx, rest_used_num_regs));
    for (int i = 0; i < reg_idx; ++i) {
      vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
    }
......
...@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
  size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
  std::unique_ptr<GenBase> CreateJitCode(
      const sgd_attr_t& attr) const override {
    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width,
                      platform::errors::InvalidArgument(
                          "The attribute param_width of Sgd should be "
                          "equal to the attribute grad_width. But param_width "
                          "is %d and grad_width is %d.",
                          attr.param_width, attr.grad_width));
    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height,
                      platform::errors::InvalidArgument(
                          "The attribute selected_rows_size of Sgd should be "
                          "equal to or less than the attribute grad_height. "
                          "But selected_rows_size is %d and grad_height is %d.",
                          attr.selected_rows_size, attr.grad_height));
    PADDLE_ENFORCE_GE(
        attr.selected_rows_size, 0,
        platform::errors::InvalidArgument(
            "The attribute selected_rows_size of Sgd should be "
            "equal to or larger than 0. But selected_rows_size is %d.",
            attr.selected_rows_size));
    return make_unique<SgdJitCode>(attr, CodeSize(attr));
  }
};
......
...@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(
        w, 0,
        platform::errors::InvalidArgument(
            "The width of VBroadcast should be larger than 0. But w is %d.",
            w));
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};
......
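The w checked above is the width of the row the kernel replicates; as a plain reference loop (illustrative sketch only, with a hypothetical helper name):

#include <cstdint>
// y receives h copies of the w-element vector x, one copy per output row.
void VBroadcastRef(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i)
    for (int64_t j = 0; j < w; ++j) y[i * w + j] = x[j];
}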
...@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) {
  void* ptr;
  constexpr size_t alignment = 32ul;
  PADDLE_ENFORCE_EQ(
      posix_memalign(&ptr, alignment, size), 0,
      platform::errors::InvalidArgument(
          "Jitcode generator (GenBase) failed to allocate %ld bytes of "
          "aligned memory!",
          size));
  PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
                                   "Failed to allocate jitcode generator "
                                   "(GenBase) CPU memory: size = %d .",
                                   size));
  return ptr;
}
......
...@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kEmbSeqPool);
    ONE_CASE(kSgd);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "JIT kernel does not support type: %d.", kt));
      return "NOT JITKernel";
  }
  return nullptr;
...@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
    ONE_CASE(kAvg);
    ONE_CASE(kSqrt);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "SeqPool JIT kernel does not support type: %d.", tp));
      return "NOT PoolType";
  }
  return nullptr;
...@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
  } else if (lower == "tanh" || lower == "vtanh") {
    return kVTanh;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", act));
  return kNone;
}
...@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
  int block, rest;
  const auto groups = packed_groups(n, k, &block, &rest);
  std::for_each(groups.begin(), groups.end(), [&](int i) {
    PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument(
                                "Each element of groups should be larger than "
                                "0. However the element %d doesn't satisfy.",
                                i));
  });
  int sum = std::accumulate(groups.begin(), groups.end(), 0);
  std::memset(dst, 0, k * sum * block * sizeof(float));
  PADDLE_ENFORCE_GE(sum * block, n,
                    platform::errors::InvalidArgument(
                        "The packed n (sum * block) should be equal to or "
                        "larger than n (matmul row size). "
                        "However, the packed n is %d and n is %d.",
                        sum * block, n));
  const int block_len = sizeof(float) * block;
  int n_offset = 0;
...@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
template <typename T>
typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
    const T* src, T* dst, int n, int k) {
  PADDLE_THROW(platform::errors::Unimplemented(
      "Only supports pack weights with float type."));
}
}  // namespace jit
......
...@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
  auto ref_iter = ref_pool.find(kkey);
  PADDLE_ENFORCE_NE(
      ref_iter, ref_pool.end(),
      platform::errors::PreconditionNotMet(
          "Every Refer Kernel of jitcode should have a reference function."));
  auto& ref_impls = ref_iter->second;
  for (auto& impl : ref_impls) {
    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
...@@ -101,7 +103,9 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
  auto ker = GetReferKernel<KernelTuple>();
  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument(
                                 "Getting the reference kernel function on "
                                 "CPU failed. The Refer kernel should exist."));
  return p->GetFunc();
}
...@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
  // The last implementation should be reference function on CPUPlace.
  auto ref = GetReferKernel<KernelTuple>();
  PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument(
                                   "Getting all candidate kernels on CPU "
                                   "failed. The Refer Kernel can not be "
                                   "empty."));
  res.emplace_back(ref);
  return res;
}
...@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
    std::string name = k->ImplType();
    if (name == "JitCode") {
      auto i = dynamic_cast<const GenBase*>(k);
      PADDLE_ENFORCE_NOT_NULL(i,
                              platform::errors::InvalidArgument(
                                  "Casting to the jitcode kernel (GenBase) "
                                  "failed."));
      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
    } else {
      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
      PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument(
                                     "Kernel cast (KernelMore) failed."));
      res.emplace_back(std::make_pair(name, i->GetFunc()));
    }
  }
...@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
    const typename KernelTuple::attr_type& attr) {
  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
  PADDLE_ENFORCE_GE(funcs.size(), 1UL,
                    platform::errors::InvalidArgument(
                        "There should be at least one candidate jit kernel "
                        "on CPU."));
  // Here could do some runtime benchmark of this attr and return the best one.
  // But yet just get the first one as the default best one,
  // which is searched in order and tuned by offline.
......
...@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  } else if (type == kVIdentity) {
    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", type));
  return nullptr;
}
......
...@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                const emb_seq_pool_attr_t* attr) {
  PADDLE_ENFORCE_EQ(
      attr->table_width * attr->index_width, attr->out_width,
      platform::errors::InvalidArgument(
          "The attribute table_width * index_width of EmbSeqPool should "
          "be equal to out_width. But table_width * index_width is %d, "
          "out_width is %d.",
          attr->table_width * attr->index_width, attr->out_width));
  auto check_idx_value_valid = [&](int64_t i) {
    PADDLE_ENFORCE_LT(
        idx[i], attr->table_height,
        platform::errors::InvalidArgument(
            "The idx should be lower than the attribute table_height of "
            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
            i, idx[i], attr->table_height));
    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
                                     "The idx should be equal to or larger "
                                     "than 0. But %dth of idx is %d.",
                                     i, idx[i]));
  };
  for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
         T* out, const sgd_attr_t* attr) {
  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
                    platform::errors::InvalidArgument(
                        "The attribute param_width of Sgd should be "
                        "equal to the attribute grad_width. But param_width "
                        "is %d and grad_width is %d.",
                        attr->param_width, attr->grad_width));
  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
                    platform::errors::InvalidArgument(
                        "The attribute selected_rows_size of Sgd should be "
                        "equal to or less than the attribute grad_height. "
                        "But selected_rows_size is %d and grad_height is %d.",
                        attr->selected_rows_size, attr->grad_height));
  T scalar = -lr[0];
  int width = attr->grad_width;
  if (out == param) {
    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
      auto h_idx = rows[i];
      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                        platform::errors::InvalidArgument(
                            "The rows of Sgd should be less than the "
                            "attribute param_height. But %dth of rows "
                            "is %d and param_height is %d.",
                            i, h_idx, attr->param_height));
      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                      "The rows of Sgd should be equal to "
                                      "or larger than 0. But %dth of rows "
                                      "is %d.",
                                      i, h_idx));
      VAXPY(scalar, grad + i * width, out + h_idx * width, width);
    }
  } else {
    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
      auto h_idx = rows[i];
      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                        platform::errors::InvalidArgument(
                            "The rows of Sgd should be less than the "
                            "attribute param_height. But %dth of rows "
                            "is %d and param_height is %d.",
                            i, h_idx, attr->param_height));
      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                      "The rows of Sgd should be equal to "
                                      "or larger than 0. But %dth of rows "
                                      "is %d.",
                                      i, h_idx));
      VScal(&scalar, grad + i * width, out + h_idx * width, width);
      VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
           width);
......
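The bounds enforced above guard the sparse update this kernel performs; written densely (a plain reference sketch reusing the attribute names, not the optimized VAXPY/VScal path):

#include <cstdint>
// For each selected row r = rows[i]: out[r, :] = param[r, :] - lr * grad[i, :],
// which is why every rows[i] must lie in [0, param_height) and widths match.
void SparseSgdRef(float lr, const float* param, const float* grad,
                  const int64_t* rows, float* out, int64_t selected_rows_size,
                  int64_t width) {
  for (int64_t i = 0; i < selected_rows_size; ++i) {
    const int64_t r = rows[i];
    for (int64_t j = 0; j < width; ++j)
      out[r * width + j] = param[r * width + j] - lr * grad[i * width + j];
  }
}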
...@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) {  // NOLINT
  } else if (type == kVIdentity) {
    return VIdentity<T>;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", type));
  return nullptr;
}
...@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                const emb_seq_pool_attr_t* attr) {
  PADDLE_ENFORCE_EQ(
      attr->table_width * attr->index_width, attr->out_width,
      platform::errors::InvalidArgument(
          "The attribute table_width * index_width of EmbSeqPool should "
          "be equal to out_width. But table_width * index_width is %d and "
          "out_width is %d.",
          attr->table_width * attr->index_width, attr->out_width));
  auto check_idx_value_valid = [&](int64_t i) {
    PADDLE_ENFORCE_LT(
        idx[i], attr->table_height,
        platform::errors::InvalidArgument(
            "The idx should be lower than the attribute table_height of "
            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
            i, idx[i], attr->table_height));
    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
                                     "The idx should be equal to or larger "
                                     "than 0. But %dth of idx is %d.",
                                     i, idx[i]));
  };
  for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
         T* out, const sgd_attr_t* attr) {
  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
                    platform::errors::InvalidArgument(
                        "The attribute param_width of Sgd should be "
                        "equal to the attribute grad_width. But param_width "
                        "is %d and grad_width is %d.",
                        attr->param_width, attr->grad_width));
  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
                    platform::errors::InvalidArgument(
                        "The attribute selected_rows_size of Sgd should be "
                        "equal to or less than the attribute grad_height. "
                        "But selected_rows_size is %d and grad_height is %d.",
                        attr->selected_rows_size, attr->grad_height));
  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
    auto h_idx = rows[i];
    PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                      platform::errors::InvalidArgument(
                          "The rows of Sgd should be less than the "
                          "attribute param_height. But %dth of rows is %d "
                          "and param_height is %d.",
                          i, h_idx, attr->param_height));
    PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                    "The rows of Sgd should be equal to or "
                                    "larger than 0. But %dth of rows is %d.",
                                    i, h_idx));
    for (int64_t j = 0; j < attr->grad_width; ++j) {
      out[h_idx * attr->grad_width + j] =
          param[h_idx * attr->grad_width + j] -
......
...@@ -850,8 +850,15 @@ void TestKernelSgd() {
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1,
                      paddle::platform::errors::InvalidArgument(
                          "The range of Sgd (upper - lower) should be equal "
                          "to or lower than n-1 (Sgd size -1). But the upper "
                          "- lower is %d and n-1 is %d.",
                          static_cast<size_t>(upper - lower), n - 1));
    PADDLE_ENFORCE_GT(
        n, 0, paddle::platform::errors::InvalidArgument(
                  "The Sgd size should be larger than 0. But the n is %d.", n));
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
......
...@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
  });
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
bool trans_a, int M, int N, platform::float16 alpha,
const platform::float16 *A, const platform::float16 *B,
platform::float16 beta, platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
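Since cuBLAS lacks a half GEMV, the specialization above reshapes the matrix-vector product into a GEMM with one unit dimension. A NumPy sketch of the equivalence (shapes are assumptions):

import numpy as np

M, N = 4, 6
A = np.random.rand(M, N).astype(np.float16)
x = np.random.rand(N).astype(np.float16)

gemv = A @ x                             # y = A * x, an (M x N) * (N,) matvec
gemm = (A @ x.reshape(N, 1)).ravel()     # the same product as an (M x N) * (N x 1) GEMM
# (the trans_a branch swaps the operand order instead: (1 x M) * (M x N))
assert np.allclose(gemv, gemm)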
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM( void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
} }
} }
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
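Likewise, the pointer-array BatchedGEMM specialization for float16 above simply loops over single GEMMs. A NumPy sketch showing the loop fallback matches one batched call (shapes are assumptions):

import numpy as np

batch, M, K, N = 3, 2, 4, 5
A = [np.random.rand(M, K).astype(np.float16) for _ in range(batch)]
B = [np.random.rand(K, N).astype(np.float16) for _ in range(batch)]

C_loop = [a @ b for a, b in zip(A, B)]   # one GEMM per batch item, as in the fallback
C_batched = np.stack(A) @ np.stack(B)    # what a native batched GEMM would return
assert np.allclose(np.stack(C_loop), C_batched)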
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
......
...@@ -17,10 +17,12 @@ limitations under the License. */ ...@@ -17,10 +17,12 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plf = paddle::platform; namespace plf = paddle::platform;
REGISTER_OP_CUDA_KERNEL(matmul_v2, REGISTER_OP_CUDA_KERNEL(
ops::MatMulV2Kernel<plf::CUDADeviceContext, float>, matmul_v2, ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, double>); ops::MatMulV2Kernel<plf::CUDADeviceContext, double>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, plf::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>, matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>); ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, plf::float16>);
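With these registrations, matmul_v2 and its gradient accept float16 inputs on CUDA. A minimal usage sketch, assuming a CUDA build and the Paddle 2.x imperative API:

import numpy as np
import paddle

paddle.set_device('gpu')  # requires a CUDA build of Paddle
x = paddle.to_tensor(np.random.rand(2, 3).astype('float16'))
y = paddle.to_tensor(np.random.rand(3, 4).astype('float16'))
out = paddle.matmul(x, y)  # dispatches to the float16 kernel registered above
print(out.dtype)           # paddle.float16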
...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (trans_y) { if (trans_y) {
const int M = Y->numel() / N; const int M = Y->numel() / N;
VLOG(3) << "MatMul's case 2"; VLOG(3) << "MatMul's case 2";
blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
const int M = y_dims[y_ndim - 1]; const int M = y_dims[y_ndim - 1];
const int batch_size = Y->numel() / (M * N); const int batch_size = Y->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 3"; VLOG(3) << "MatMul's case 3";
blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 4"; VLOG(3) << "MatMul's case 4";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
x_data, 0, Out->data<T>(), batch_size, M * N, 0); y_data, x_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} }
return; return;
...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
const int batch_size = X->numel() / (M * N); const int batch_size = X->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 5"; VLOG(3) << "MatMul's case 5";
blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 6"; VLOG(3) << "MatMul's case 6";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
y_data, 0, Out->data<T>(), batch_size, M * N, 0); x_data, y_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} else { } else {
const int M = X->numel() / N; const int M = X->numel() / N;
VLOG(3) << "MatMul's case 7"; VLOG(3) << "MatMul's case 7";
blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} }
return; return;
} }
...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (x_batch_size == 1 && y_batch_size == 1) { if (x_batch_size == 1 && y_batch_size == 1) {
VLOG(3) << "MatMul's case 8"; VLOG(3) << "MatMul's case 8";
blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast<T>(1),
y_data, 0.0f, Out->data<T>()); x_data, y_data, static_cast<T>(0), Out->data<T>());
} else if (x_batch_size == 1) { } else if (x_batch_size == 1) {
if (M == 1 && trans_y) { if (M == 1 && trans_y) {
VLOG(3) << "MatMul's case 9"; VLOG(3) << "MatMul's case 9";
blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, blas.GEMV(false, y_batch_size * N, K, static_cast<T>(1), y_data, x_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 10"; VLOG(3) << "MatMul's case 10";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_data, y_data, 0, Out->data<T>(), out_batch_size, 0, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
K * N); Out->data<T>(), out_batch_size, 0, K * N);
} }
} else if (y_batch_size == 1) { } else if (y_batch_size == 1) {
if (!trans_x) { if (!trans_x) {
VLOG(3) << "MatMul's case 11"; VLOG(3) << "MatMul's case 11";
blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, x_batch_size * M, N, K, static_cast<T>(1), x_data, y_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 12"; VLOG(3) << "MatMul's case 12";
blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
M * K, 0); Out->data<T>(), out_batch_size, M * K, 0);
} }
} else if (!is_broadcast_dims) { } else if (!is_broadcast_dims) {
VLOG(3) << "MatMul's case 13"; VLOG(3) << "MatMul's case 13";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N); static_cast<T>(1), x_data, y_data, static_cast<T>(0),
Out->data<T>(), out_batch_size, M * K, K * N);
} else { } else {
// In this case strided GEMM can't be used // In this case strided GEMM can't be used
std::vector<const T*> x_ptr(out_batch_size); std::vector<const T*> x_ptr(out_batch_size);
...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
} }
VLOG(3) << "MatMul's case 14"; VLOG(3) << "MatMul's case 14";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), static_cast<T>(1), x_ptr.data(), y_ptr.data(),
out_batch_size); static_cast<T>(0), out_ptr.data(), out_batch_size);
} }
} }
......
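Of the fourteen dispatch cases above, case 9 is the least obvious: when x is a single vector (M == 1) and y is a batch of transposed matrices, the whole batch collapses into one GEMV over y_batch_size * N rows. A NumPy check of that collapse (shapes are assumptions):

import numpy as np

batch, N, K = 3, 4, 5
x = np.random.rand(K)               # M == 1: x is a plain vector
y = np.random.rand(batch, N, K)     # trans_y: each out[b] = y[b] @ x

per_batch = np.stack([y[b] @ x for b in range(batch)])      # [batch, N]
one_gemv = (y.reshape(batch * N, K) @ x).reshape(batch, N)  # single GEMV, case 9
assert np.allclose(per_batch, one_gemv)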
...@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout; ...@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy, // np.sum(dy,
// axis=(n,h,w)) * (x - mean) * // axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var // inv_var
// * // *
// np.mean(dy, axis=(n,h,w)) - // np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))) // axis=(n,h,w)))
template <typename T, int BlockDim, framework::DataLayout layout> template <typename T, int BlockDim, framework::DataLayout layout>
__global__ void DoubleGradComputeDX(const T *x, const T *mean, __global__ void DoubleGradComputeDX(const T *x, const T *mean,
...@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, ...@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
? (j / sample_size * C + i) * sample_size + j % sample_size ? (j / sample_size * C + i) * sample_size + j % sample_size
: j * outer_size + i; : j * outer_size + i;
dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val -
(x[index] - mean_val) * var_val * (x[index] - mean_val) * var_val * var_val *
dy_mul_x_sub_mean_sum_val * var_val / inner_size) * dy_mul_x_sub_mean_sum_val * var_val / inner_size) *
ddscale[i]; ddscale[i];
} }
...@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal( ...@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
} }
// math: dx = ddscale * dy * inv_var // math: dx = ddscale * dy * inv_var
// math: ddy = scale * ddx * inv_var
template <typename T, framework::DataLayout layout> template <typename T, framework::DataLayout layout>
__global__ void DoubleGradComputeDataWithGlobal( __global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale,
const T *dy, const T *scale, const T *variance, const double epsilon, const T *variance,
const int C, const int sample_size, const int num, T *dx) { const double epsilon, const int C,
const int sample_size,
const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x; int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
if (scale != nullptr) { if (ddscale != nullptr) {
for (int i = gid; i < num; i += stride) { for (int i = gid; i < num; i += stride) {
const int c = const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon); T inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = dy[i] * scale[c] * inv_var; dx[i] = dy[i] * ddscale[c] * inv_var;
}
}
}
// math: ddy = scale * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
template <typename T, framework::DataLayout layout>
__global__ void DoubleGradComputeDDYWithGlobal(
const T *ddx, const T *scale, const T *mean, const T *variance, const T *x,
const T *ddbias, const T *ddscale, const double epsilon, const int C,
const int sample_size, const int num, T *ddy) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
if (ddx != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon);
ddy[i] += ddx[i] * scale[c] * inv_var;
}
}
__syncthreads();
if (ddscale != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon);
ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c];
}
}
__syncthreads();
if (ddbias != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
ddy[i] += ddbias[c];
} }
} }
} }
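The three guarded loops above accumulate ddy = scale * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var term by term, so each of ddx/ddscale/ddbias may independently be absent. A per-channel NumPy sketch of the same accumulation (epsilon and shapes are assumptions; mean/var stand in for the running statistics used in global mode):

import numpy as np

eps = 1e-5
x = np.random.rand(10).astype(np.float32)
ddx = np.random.rand(10).astype(np.float32)
scale, ddscale, ddbias = 1.5, 0.3, 0.1   # one channel's parameters
mean, var = x.mean(), x.var()            # running statistics in global mode
inv_var = 1.0 / np.sqrt(var + eps)

ddy = np.zeros_like(x)
ddy += ddx * scale * inv_var             # ddx term (first loop)
ddy += (x - mean) * inv_var * ddscale    # ddscale term (second loop)
ddy += ddbias                            # ddbias term (third loop)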
...@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *mean_data, *variance_data; const T *mean_data, *variance_data;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance"); const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>(); const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data;
variance_data = running_var_data; variance_data = running_var_data;
} else { } else {
const T *smean_data = Saved_mean->data<T>(); const T *smean_data = Saved_mean->data<T>();
...@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant(dev_ctx, dX, static_cast<T>(0)); set_constant(dev_ctx, dX, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} else { } else {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
...@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant(dev_ctx, ddY, static_cast<T>(0)); set_constant(dev_ctx, ddY, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else { } else {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
......
...@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) { ...@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
// NOTE: Converting int64 to int32 may cause data overflow. // NOTE: Converting int64 to int32 may cause data overflow.
vec_new_data = std::vector<T>(data, data + x->numel()); vec_new_data = std::vector<T>(data, data + x->numel());
} else { } else {
PADDLE_THROW("The dtype of Tensor must be int32 or int64."); PADDLE_THROW(platform::errors::InvalidArgument(
"The dtype of Tensor must be int32 or int64, but received: %s",
x->type()));
} }
return vec_new_data; return vec_new_data;
} }
...@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList( ...@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
for (size_t i = 0; i < list_tensor.size(); ++i) { for (size_t i = 0; i < list_tensor.size(); ++i) {
auto tensor = list_tensor[i]; auto tensor = list_tensor[i];
PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
"ShapeError: The shape of Tensor in list must be [1]. " platform::errors::InvalidArgument(
"But received the shape " "The shape of Tensor in list must be [1]. "
"is [%s]", "But received its shape "
tensor->dims()); "is [%s]",
tensor->dims()));
if (tensor->type() == framework::proto::VarType::INT32) { if (tensor->type() == framework::proto::VarType::INT32) {
if (platform::is_gpu_place(tensor->place())) { if (platform::is_gpu_place(tensor->place())) {
...@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList( ...@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>())); vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>()));
} }
} else { } else {
PADDLE_THROW("The dtype of Tensor in list must be int32 or int64."); PADDLE_THROW(platform::errors::InvalidArgument(
"The dtype of Tensor in list must be int32 or int64, but received: "
"%s",
tensor->type()));
} }
} }
return vec_new_data; return vec_new_data;
......
...@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int); ...@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int); USE_CUDA_ATOMIC(Max, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here. // The CUDA API uses unsigned long long int; we cannot use uint64_t here.
// That is because unsigned long long int is not necessarily uint64_t. // That is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {
  if (*address >= val) {
    return *address;
  }
  unsigned long long int old = *address, assumed;
  do {
    assumed = old;
    if (assumed >= val) {
      break;
    }
    old = atomicCAS(address, assumed, val);
  } while (assumed != old);
  return old;
}
#endif
CUDA_ATOMIC_WRAPPER(Max, int64_t) { CUDA_ATOMIC_WRAPPER(Max, int64_t) {
// Here, we check long long int must be int64_t. // Here, we check long long int must be int64_t.
...@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int); ...@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int); USE_CUDA_ATOMIC(Min, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here. // The CUDA API uses unsigned long long int; we cannot use uint64_t here.
// That is because unsigned long long int is not necessarily uint64_t. // That is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {
  if (*address <= val) {
    return *address;
  }
  unsigned long long int old = *address, assumed;
  do {
    assumed = old;
    if (assumed <= val) {
      break;
    }
    old = atomicCAS(address, assumed, val);
  } while (assumed != old);
  return old;
}
#endif
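Both fallbacks follow the same compare-and-swap pattern: early-out when the stored value already wins, otherwise retry the swap until no other thread has intervened. A Python simulation of the Max variant, with a hypothetical atomic_cas standing in for CUDA's atomicCAS:

def atomic_cas(cell, expected, new):
    # Hypothetical stand-in for atomicCAS: swap only if the cell still
    # holds `expected`; always return the value the cell held.
    old = cell[0]
    if old == expected:
        cell[0] = new
    return old

def atomic_max(cell, val):
    if cell[0] >= val:            # early out: nothing to update
        return cell[0]
    old = cell[0]
    while True:                   # the do/while CAS loop from the kernel
        assumed = old
        if assumed >= val:        # another "thread" stored a larger value
            break
        old = atomic_cas(cell, assumed, val)
        if assumed == old:        # our swap took effect
            break
    return old

cell = [3]
atomic_max(cell, 7)
assert cell[0] == 7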
CUDA_ATOMIC_WRAPPER(Min, int64_t) { CUDA_ATOMIC_WRAPPER(Min, int64_t) {
// Here, we check long long int must be int64_t. // Here, we check long long int must be int64_t.
......
...@@ -15,4 +15,5 @@ ...@@ -15,4 +15,5 @@
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle as pd import paddle as pd
fluid.install_check.run_check()
print(pd.__version__) print(pd.__version__)
...@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON ...@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem -------set cache build work directory----------- rem -------set cache build work directory-----------
rmdir build\python /s/q
if "%WITH_CACHE%"=="OFF" ( if "%WITH_CACHE%"=="OFF" (
rmdir build /s/q rmdir build /s/q
goto :mkbuild goto :mkbuild
...@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" ( ...@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
set day_now=%datetime:~6,2% set day_now=%datetime:~6,2%
set day_before=-1 set day_before=-1
set /p day_before=<day.txt set /p day_before=< %work_dir%\..\day.txt
if %day_now% NEQ %day_before% ( if %day_now% NEQ %day_before% (
echo %day_now% > day.txt echo %day_now% > %work_dir%\..\day.txt
type day.txt type %work_dir%\..\day.txt
rmdir build /s/q rmdir build /s/q
) )
git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
...@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time: ...@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
if %build_times% GTR 3 ( if %build_times% GTR 2 (
exit /b 7 exit /b 7
) else ( ) else (
echo Build third_party failed, will retry! echo Build third_party failed, will retry!
...@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time: ...@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
if %build_times% GTR 2 ( if %build_times% GTR 1 (
exit /b 7 exit /b 7
) else ( ) else (
echo Build Paddle failed, will retry! echo Build Paddle failed, will retry!
...@@ -301,6 +302,7 @@ goto:eof ...@@ -301,6 +302,7 @@ goto:eof
call paddle_winci\Scripts\deactivate.bat 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10% set end=%end:~4,10%
call :timestamp "%start%" "%end%" "1 card TestCases Total"
call :timestamp "%start%" "%end%" "TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total"
echo Running unit tests failed, will exit! echo Running unit tests failed, will exit!
exit /b 8 exit /b 8
...@@ -313,6 +315,7 @@ echo ======================================== ...@@ -313,6 +315,7 @@ echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10% set end=%end:~4,10%
call :timestamp "%start%" "%end%" "1 card TestCases Total"
call :timestamp "%start%" "%end%" "TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total"
cd %work_dir%\paddle\fluid\inference\api\demo_ci cd %work_dir%\paddle\fluid\inference\api\demo_ci
...@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte ...@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte
echo EOF>> check_change_of_unittest.sh echo EOF>> check_change_of_unittest.sh
echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh
echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh
echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh
echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh
echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh
echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh
echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh
...@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL ...@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
taskkill /f /im rc.exe 2>NUL taskkill /f /im rc.exe 2>NUL
wmic process where name="op_function_generator.exe" call terminate 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
call paddle_winci\Scripts\deactivate.bat 2>NUL
del %PADDLE_WHL_FILE_WIN%
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
echo Windows CI run successfully! echo Windows CI run successfully!
exit /b 0 exit /b 0
......
...@@ -988,11 +988,6 @@ set +x ...@@ -988,11 +988,6 @@ set +x
fi fi
read testcase <<< $(echo "$line"|grep -oEi "\w+$") read testcase <<< $(echo "$line"|grep -oEi "\w+$")
if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then
echo $testcase" is disabled."
continue
fi
if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
echo $testcase" will only run at night." echo $testcase" will only run at night."
continue continue
......
...@@ -845,6 +845,29 @@ class DistributedStrategy(object): ...@@ -845,6 +845,29 @@ class DistributedStrategy(object):
check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs")
assign_configs_value(self.strategy.dgc_configs, configs) assign_configs_value(self.strategy.dgc_configs, configs)
@property
def fp16_allreduce(self):
"""
Indicate whether to use float16 (fp16) gradient compression during allreduce
in distributed training.
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True # by default this is false
"""
return self.strategy.fp16_allreduce
@fp16_allreduce.setter
@is_strict_auto
def fp16_allreduce(self, flag):
if not isinstance(flag, bool):
raise TypeError('fp16_allreduce must be a bool')
self.strategy.fp16_allreduce = flag
@property @property
def gradient_merge(self): def gradient_merge(self):
""" """
......
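A slightly fuller sketch of wiring the new switch into a collective job (assumes a distributed launch; the optimizer lines are illustrative):

import paddle.distributed.fleet as fleet

fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True  # cast fp32 gradients to fp16 for allreduce

# optimizer = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())
# optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)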
...@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer ...@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer from .lamb_optimizer import LambOptimizer
from .fp16_allreduce_optimizer import FP16AllReduceOptimizer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core, framework, unique_name
from .meta_optimizer_base import MetaOptimizerBase
class FP16AllReduceOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
super(FP16AllReduceOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = [
"LarsOptimizer",
"LambOptimizer",
"RecomputeOptimizer",
"LocalSGDOptimizer",
"GradientMergeOptimizer",
"GraphExecutionOptimizer",
"AdaptiveLocalSGDOptimizer",
]
self.meta_optimizers_black_list = ["DGCOptimizer"]
def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
user_defined_strategy):
super(FP16AllReduceOptimizer, self)._set_basic_info(
loss, role_maker, user_defined_optimizer, user_defined_strategy)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.fp16_allreduce:
return True
return False
def _disable_strategy(self, dist_strategy):
dist_strategy.fp16_allreduce = False
def _enable_strategy(self, dist_strategy, context=None):
dist_strategy.fp16_allreduce = True
@staticmethod
def fp16_compression(param_and_grads):
"""
Compress fp32 gradients to fp16 during allreduce.
"""
op_maker = core.op_proto_and_checker_maker
new_param_and_grads = [] # param, grad, is_cast
# cast grad from fp32->fp16 before allreduce,
for param, grad in param_and_grads:
if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
new_param_and_grads.append((param, grad, False))
continue
op = grad.op
block = grad.block
var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
if param.name not in var_attr:
new_param_and_grads.append((param, grad, False))
continue
# remove (param, grad) from op_role_var
var_attr.remove(param.name)
var_attr.remove(grad.name)
if len(var_attr) > 1:
op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
else:
op._remove_attr(op_maker.kOpRoleVarAttrName())
new_grad = block.create_var(
name=unique_name.generate(grad.name + ".cast_fp16"),
dtype=core.VarDesc.VarType.FP16,
persistable=False,
stop_gradient=True)
with block.program._backward_role_guard():
cast_op = block.append_op(
type="cast",
inputs={"X": grad},
outputs={"Out": new_grad},
attrs={
"in_dtype": core.VarDesc.VarType.FP32,
"out_dtype": core.VarDesc.VarType.FP16
},
stop_gradient=True)
backward = op_maker.OpRole.Backward
cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param.name, new_grad.name])
new_grad.op = cast_op
new_param_and_grads.append((param, new_grad, True))
ret_param_and_grads = []
# cast grad from fp16->fp32 after allreduce.
# NOTE: fp16 compression is split into two for-loops on purpose.
# If the fp32->fp16 and fp16->fp32 casts are emitted in a single
# loop, the fuse-allreduce pass produces wrong results; this is
# likely a bug in that pass and needs to be fixed in the future.
for param, grad, cast in new_param_and_grads:
if not cast:
ret_param_and_grads.append((param, grad))
continue
block = grad.block
new_grad = block.create_var(
name=unique_name.generate(grad.name + ".cast_fp32"),
dtype=core.VarDesc.VarType.FP32,
persistable=False,
stop_gradient=True)
with block.program._optimized_guard(
[param, grad]), framework.name_scope('fp16_allreduce'):
cast_op = block.append_op(
type="cast",
inputs={"X": grad},
outputs={"Out": new_grad},
attrs={
"in_dtype": core.VarDesc.VarType.FP16,
"out_dtype": core.VarDesc.VarType.FP32
},
stop_gradient=True)
ret_param_and_grads.append((param, new_grad))
return ret_param_and_grads
def apply_optimize(self, loss, startup_program, params_grads):
new_params_grads = self.fp16_compression(params_grads)
return self.inner_opt.apply_optimize(
loss,
startup_program=startup_program,
params_grads=new_params_grads)
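The practical effect of fp16_compression is a fp32 -> fp16 -> (allreduce) -> fp32 round trip on each gradient, trading bandwidth for precision. A NumPy sketch of the perturbation this introduces (values are illustrative):

import numpy as np

grad = np.array([1e-4, 1.0, 65519.0, 70000.0], dtype=np.float32)
roundtrip = grad.astype(np.float16).astype(np.float32)
print(roundtrip)         # rounding near 1e-4; 65519 clamps toward fp16's
                         # max of 65504; 70000 overflows to inf
print(roundtrip - grad)  # the error the compression injects before averaging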
...@@ -1355,7 +1355,7 @@ class Executor(object): ...@@ -1355,7 +1355,7 @@ class Executor(object):
if not program._fleet_opt is None: if not program._fleet_opt is None:
if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
is_heter = 1 is_heter = 1
if program._fleet_opt("trainer", "") == "HeterXpuTrainer": if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer":
is_heter = 1 is_heter = 1
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
......
...@@ -167,10 +167,10 @@ class DataLoader(object): ...@@ -167,10 +167,10 @@ class DataLoader(object):
The variables should be created by :code:`fluid.data()`. The variables should be created by :code:`fluid.data()`.
:attr:`feed_list` must be set if :attr:`return_list` is :attr:`feed_list` must be set if :attr:`return_list` is
False. Default None. False. Default None.
places(list(Place)|tuple(Place)): a list of Place, to put data places(list(Place)|tuple(Place)|optional): a list of Place
onto, :attr:`places` must be set in both static graph and to put data onto. :attr:`places` can be None; if it
dynamic graph mode, in dynamic graph mode, place number must is None, the default place (CPUPlace or CUDAPlace(0))
be 1. Default None. will be used. Default None.
return_list (bool): whether the return value on each device is return_list (bool): whether the return value on each device is
presented as a list. If :attr:`return_list=False`, the return presented as a list. If :attr:`return_list=False`, the return
value on each device would be a dict of str -> LoDTensor, where value on each device would be a dict of str -> LoDTensor, where
...@@ -222,6 +222,8 @@ class DataLoader(object): ...@@ -222,6 +222,8 @@ class DataLoader(object):
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader from paddle.io import Dataset, BatchSampler, DataLoader
...@@ -247,11 +249,48 @@ class DataLoader(object): ...@@ -247,11 +249,48 @@ class DataLoader(object):
def __len__(self): def __len__(self):
return self.num_samples return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
# get places # get places
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
# -------------------- static graph --------------------- # -------------------- static graph ---------------------
paddle.enable_static()
def simple_net(image, label): def simple_net(image, label):
fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax') fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label) cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
...@@ -270,11 +309,8 @@ class DataLoader(object): ...@@ -270,11 +309,8 @@ class DataLoader(object):
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name) prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = DataLoader(dataset, loader = DataLoader(dataset,
feed_list=[image, label], feed_list=[image, label],
places=places,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
...@@ -287,39 +323,6 @@ class DataLoader(object): ...@@ -287,39 +323,6 @@ class DataLoader(object):
# ------------------------------------------------------- # -------------------------------------------------------
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
places=places[0],
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
.. note:: .. note::
For reading iterable dataset with multiprocess Dataloader, For reading iterable dataset with multiprocess Dataloader,
...@@ -356,11 +359,9 @@ class DataLoader(object): ...@@ -356,11 +359,9 @@ class DataLoader(object):
"feed_list should be set when return_list=False" "feed_list should be set when return_list=False"
self.feed_list = feed_list self.feed_list = feed_list
assert places is not None, "places cannot be None" if places is None:
places = _current_expected_place()
self.places = _convert_places(places) self.places = _convert_places(places)
if in_dygraph_mode():
assert len(self.places) == 1, \
"Number of places must be 1 in dygraph mode"
assert num_workers >= 0, "num_workers should be a non-negative value" assert num_workers >= 0, "num_workers should be a non-negative value"
if num_workers > 0 and (sys.platform == 'darwin' or if num_workers > 0 and (sys.platform == 'darwin' or
......
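With places now optional, the common case needs no placement argument at all. A minimal self-contained sketch (the dataset contents are assumptions):

import numpy as np
from paddle.io import Dataset, DataLoader

class RandomDataset(Dataset):
    def __len__(self):
        return 8
    def __getitem__(self, idx):
        return np.random.rand(4).astype('float32')

# No `places` argument: the current expected place is used
# (CPUPlace, or CUDAPlace(0) on a CUDA build).
loader = DataLoader(RandomDataset(), batch_size=2)
for batch in loader():
    print(batch)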
...@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) ...@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
...@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) ...@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
# disable this unittest temporarily # disable this unittest temporarily
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
list(REMOVE_ITEM TEST_OPS test_sampling_id_op) list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
list(REMOVE_ITEM TEST_OPS test_paddle_save_load)
if (APPLE OR WIN32) if (APPLE OR WIN32)
list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset)
...@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE) ...@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers import FP16AllReduceOptimizer as FP16AllReduce
from test_dist_base import TestDistRunnerBase, runtime_main
from dist_mnist import cnn_model
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9)
opt = FP16AllReduce(opt)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvActivationMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 100, 100], dtype="float32")
conv_out = fluid.layers.conv2d(
data,
num_filters=self.conv_num_filters,
filter_size=self.conv_filter_size,
bias_attr=self.conv_bias_attr,
act=self.act)
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [conv_out]
self.enable_mkldnn = True
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "relu"
self.pass_name = 'conv_relu_mkldnn_fuse_pass'
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "relu"
self.pass_name = 'conv_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "leaky_relu"
self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "leaky_relu"
self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "relu6"
self.pass_name = 'conv_relu6_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_5(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "swish"
self.pass_name = 'conv_swish_mkldnn_fuse_pass'
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data_1 = fluid.data(
name="data_1", shape=[-1, 3, 100, 100], dtype="float32")
data_2 = fluid.data(
name="data_2", shape=[-1, 3, 100, 100], dtype="float32")
conv_1 = fluid.layers.conv2d(
data_1,
num_filters=self.conv1_num_filters,
filter_size=self.conv1_filter_size,
padding=self.conv1_padding,
bias_attr=self.conv1_bias_attr)
conv_2 = fluid.layers.conv2d(
data_2,
num_filters=self.conv2_num_filters,
filter_size=self.conv2_filter_size,
padding=self.conv2_padding,
bias_attr=self.conv2_bias_attr)
concat = fluid.layers.concat(
[conv_1, conv_2], axis=self.concat_axis)
out = fluid.layers.relu(concat)
self.feeds = {
"data_1": np.random.random((1, 3, 100, 100)).astype("float32"),
"data_2": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [out]
self.enable_mkldnn = True
def set_params(self):
self.conv1_num_filters = 3
self.conv1_filter_size = 3
self.conv1_padding = 0
self.conv1_bias_attr = False
self.conv2_num_filters = 3
self.conv2_filter_size = 3
self.conv2_padding = 0
self.conv2_bias_attr = False
self.concat_axis = 0
self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0):
def set_params(self):
self.conv1_num_filters = 3
self.conv1_filter_size = 3
self.conv1_padding = 0
self.conv1_bias_attr = False
self.conv2_num_filters = 5
self.conv2_filter_size = 5
self.conv2_padding = 1
self.conv2_bias_attr = True
self.concat_axis = 1
self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=self.data_shape, dtype="float32")
weight = fluid.layers.create_parameter(
shape=self.weight_shape, dtype="float32")
matmul = fluid.layers.matmul(
data,
weight,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y)
transpose = fluid.layers.transpose(matmul, self.transpose_perm)
reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape)
self.fetch_list = [reshape]
self.enable_mkldnn = True
def set_params(self):
self.data_shape = [-1, 3, 100, 110]
self.weight_shape = [1, 3, 110, 100]
self.feeds = {
"data": np.random.random((1, 3, 100, 110)).astype("float32")
}
self.transpose_x = False
self.transpose_y = False
self.transpose_perm = [0, 2, 1, 3]
self.reshape_shape = [3, 100, 100]
self.pass_name = 'matmul_transpose_reshape_fuse_pass'
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class MatmulTransposeReshapeMkldnnFusePassTest_1(
MatmulTransposeReshapeMkldnnFusePassTest):
def set_params(self):
self.data_shape = [-1, 3, 100, 100]
self.weight_shape = [1, 3, 100, 100]
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.transpose_x = True
self.transpose_y = True
self.transpose_perm = [0, 2, 1, 3]
self.reshape_shape = [6, 50, 100]
self.pass_name = 'matmul_transpose_reshape_fuse_pass'
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ScaleMatmulMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[1, 3, 100, 100], dtype="float32")
weight = fluid.layers.create_parameter(
shape=[1, 3, 100, 100], dtype="float32")
scale = fluid.layers.scale(data, scale=self.scale_scale)
matmul = fluid.layers.matmul(
scale,
weight,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y)
self.fetch_list = [matmul]
self.enable_mkldnn = True
def set_params(self):
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.scale_scale = 2.0
self.transpose_x = False
self.transpose_y = False
self.pass_name = "scale_matmul_fuse_pass"
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest):
def set_params(self):
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.scale_scale = 5.0
self.transpose_x = True
self.transpose_y = True
self.pass_name = "scale_matmul_fuse_pass"
if __name__ == "__main__":
unittest.main()
...@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest ...@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvBnFusePassMKLDNNTest(InferencePassTest): class SeqConcatFCFusePassTest(InferencePassTest):
def setUp(self): def test_compatible(self):
with fluid.program_guard(self.main_program, self.startup_program): self.assertTrue(
data = fluid.data( PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass'))
name="data", shape=[-1, 3, 100, 100], dtype="float32")
conv_out = fluid.layers.conv2d(
data, num_filters=3, filter_size=3, bias_attr=False, act="relu")
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [conv_out]
self.enable_mkldnn = True
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase): ...@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
self.func(p) self.func(p)
class TestAbsDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
# The shape of the input variable must be fully specified and must not include -1.
shape = [2, 3, 7, 9]
eps = 1e-6
dtype = np.float64
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.abs(x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
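For intuition on what TestAbsDoubleGradCheck exercises: away from zero, abs(x) is linear, so its second derivative vanishes and the double-grad check reduces to verifying the first-order sign. A finite-difference sketch (eps mirrors the test's value; the points avoid the kink at 0):

import numpy as np

eps = 1e-6
x = np.array([-0.8, -0.3, 0.2, 0.9])  # away from the nondifferentiable point 0

grad = (np.abs(x + eps) - np.abs(x - eps)) / (2 * eps)    # d|x|/dx = sign(x)
assert np.allclose(grad, np.sign(x))

ggrad = (np.abs(x + eps) - 2 * np.abs(x) + np.abs(x - eps)) / eps**2
assert np.allclose(ggrad, 0.0, atol=1e-3)                 # second derivative ~ 0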
...@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard ...@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
class TestSqrtOpError(unittest.TestCase): class TestSqrtOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
# The input type of sqrt op must be Variable or numpy.ndarray. # The input type of sqrt op must be Variable or numpy.ndarray.
in1 = 1 in1 = 1
...@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase): ...@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
class TestActivation(OpTest): class TestActivation(OpTest):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "exp" self.op_type = "exp"
self.init_dtype() self.init_dtype()
self.init_kernel_type() self.init_kernel_type()
...@@ -71,6 +73,7 @@ class TestActivation(OpTest): ...@@ -71,6 +73,7 @@ class TestActivation(OpTest):
class TestParameter(object): class TestParameter(object):
def test_out_name(self): def test_out_name(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()): with fluid.program_guard(fluid.Program()):
np_x = np.array([0.1]) np_x = np.array([0.1])
data = fluid.layers.data(name="X", shape=[1]) data = fluid.layers.data(name="X", shape=[1])
...@@ -92,6 +95,7 @@ class TestParameter(object): ...@@ -92,6 +95,7 @@ class TestParameter(object):
class TestSigmoid(TestActivation): class TestSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sigmoid" self.op_type = "sigmoid"
self.init_dtype() self.init_dtype()
...@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation): ...@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
class TestLogSigmoid(TestActivation): class TestLogSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "logsigmoid" self.op_type = "logsigmoid"
self.init_dtype() self.init_dtype()
...@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase): ...@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
class TestTanh(TestActivation, TestParameter): class TestTanh(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "tanh" self.op_type = "tanh"
self.init_dtype() self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
...@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase): ...@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
class TestAtan(TestActivation, TestParameter): class TestAtan(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "atan" self.op_type = "atan"
self.init_dtype() self.init_dtype()
...@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter): ...@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
class TestSinh(TestActivation): class TestSinh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sinh" self.op_type = "sinh"
self.init_dtype() self.init_dtype()
...@@ -349,6 +357,7 @@ class TestSinh(TestActivation): ...@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
class TestSinhOpError(unittest.TestCase): class TestSinhOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.sinh, 1) self.assertRaises(TypeError, fluid.layers.sinh, 1)
...@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase): ...@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
class TestCosh(TestActivation): class TestCosh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "cosh" self.op_type = "cosh"
self.init_dtype() self.init_dtype()
...@@ -420,6 +430,7 @@ class TestCosh(TestActivation): ...@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
class TestCoshOpError(unittest.TestCase): class TestCoshOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.cosh, 1) self.assertRaises(TypeError, fluid.layers.cosh, 1)
...@@ -438,6 +449,7 @@ def ref_tanhshrink(x): ...@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
class TestTanhshrink(TestActivation): class TestTanhshrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
self.init_dtype() self.init_dtype()
...@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold): ...@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
class TestHardShrink(TestActivation): class TestHardShrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "hard_shrink" self.op_type = "hard_shrink"
self.init_dtype() self.init_dtype()
...@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink): ...@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
class TestHardShrinkAPI(unittest.TestCase): class TestHardShrinkAPI(unittest.TestCase):
# test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
def setUp(self): def setUp(self):
paddle.enable_static()
self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
else paddle.CPUPlace() else paddle.CPUPlace()
...@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5): ...@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
class TestSoftshrink(TestActivation): class TestSoftshrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softshrink" self.op_type = "softshrink"
self.init_dtype() self.init_dtype()
...@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase): ...@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
class TestSqrt(TestActivation, TestParameter): class TestSqrt(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sqrt" self.op_type = "sqrt"
self.init_dtype() self.init_dtype()
...@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter): ...@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
class TestRsqrt(TestActivation): class TestRsqrt(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "rsqrt" self.op_type = "rsqrt"
self.init_dtype() self.init_dtype()
...@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation): ...@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
class TestAbs(TestActivation): class TestAbs(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "abs" self.op_type = "abs"
self.init_dtype() self.init_dtype()
...@@ -792,6 +810,7 @@ class TestAbs(TestActivation): ...@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
class TestCeil(TestActivation): class TestCeil(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "ceil" self.op_type = "ceil"
self.init_dtype() self.init_dtype()
...@@ -808,6 +827,7 @@ class TestCeil(TestActivation): ...@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
class TestFloor(TestActivation): class TestFloor(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "floor" self.op_type = "floor"
self.init_dtype() self.init_dtype()
...@@ -826,6 +846,7 @@ class TestFloor(TestActivation): ...@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
class TestCos(TestActivation): class TestCos(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "cos" self.op_type = "cos"
self.init_dtype() self.init_dtype()
...@@ -843,6 +864,7 @@ class TestCos(TestActivation): ...@@ -843,6 +864,7 @@ class TestCos(TestActivation):
class TestAcos(TestActivation): class TestAcos(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "acos" self.op_type = "acos"
self.init_dtype() self.init_dtype()
...@@ -860,6 +882,7 @@ class TestAcos(TestActivation): ...@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
class TestSin(TestActivation, TestParameter): class TestSin(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sin" self.op_type = "sin"
self.init_dtype() self.init_dtype()
...@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter): ...@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
class TestAsin(TestActivation): class TestAsin(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "asin" self.op_type = "asin"
self.init_dtype() self.init_dtype()
...@@ -894,6 +918,7 @@ class TestAsin(TestActivation): ...@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
class TestRound(TestActivation): class TestRound(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "round" self.op_type = "round"
self.init_dtype() self.init_dtype()
...@@ -909,6 +934,7 @@ class TestRound(TestActivation): ...@@ -909,6 +934,7 @@ class TestRound(TestActivation):
class TestRelu(TestActivation): class TestRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "relu" self.op_type = "relu"
self.init_dtype() self.init_dtype()
...@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation): ...@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
return 0.02 return 0.02
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "leaky_relu" self.op_type = "leaky_relu"
self.init_dtype() self.init_dtype()
alpha = self.get_alpha() alpha = self.get_alpha()
...@@ -1084,6 +1111,7 @@ def gelu(x, approximate): ...@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
class TestGeluApproximate(TestActivation): class TestGeluApproximate(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "gelu" self.op_type = "gelu"
self.init_dtype() self.init_dtype()
approximate = True approximate = True
...@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation): ...@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
class TestGelu(TestActivation): class TestGelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "gelu" self.op_type = "gelu"
self.init_dtype() self.init_dtype()
approximate = False approximate = False
...@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase): ...@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
class TestBRelu(TestActivation): class TestBRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "brelu" self.op_type = "brelu"
self.init_dtype() self.init_dtype()
...@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation): ...@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
class TestBReluOpError(unittest.TestCase): class TestBReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.brelu, 1) self.assertRaises(TypeError, fluid.layers.brelu, 1)
...@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0): ...@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
class TestRelu6(TestActivation): class TestRelu6(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "relu6" self.op_type = "relu6"
self.init_dtype() self.init_dtype()
...@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase): ...@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
class TestHardSwish(TestActivation): class TestHardSwish(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = 'hard_swish' self.op_type = 'hard_swish'
self.init_dtype() self.init_dtype()
...@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation): ...@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
class TestHardSwishOpError(unittest.TestCase): class TestHardSwishOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.hard_swish, 1) self.assertRaises(TypeError, fluid.layers.hard_swish, 1)
...@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase): ...@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
class TestSoftRelu(TestActivation): class TestSoftRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "soft_relu" self.op_type = "soft_relu"
self.init_dtype() self.init_dtype()
...@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation): ...@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
class TestSoftReluOpError(unittest.TestCase): class TestSoftReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.soft_relu, 1) self.assertRaises(TypeError, fluid.layers.soft_relu, 1)
...@@ -1366,6 +1402,7 @@ def elu(x, alpha): ...@@ -1366,6 +1402,7 @@ def elu(x, alpha):
class TestELU(TestActivation): class TestELU(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "elu" self.op_type = "elu"
self.init_dtype() self.init_dtype()
...@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase): ...@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
class TestReciprocal(TestActivation): class TestReciprocal(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "reciprocal" self.op_type = "reciprocal"
self.init_dtype() self.init_dtype()
...@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation): ...@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
class TestLog(TestActivation): class TestLog(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "log" self.op_type = "log"
self.init_dtype() self.init_dtype()
...@@ -1478,6 +1517,7 @@ class TestLog(TestActivation): ...@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
class TestLog1p(TestActivation): class TestLog1p(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "log1p" self.op_type = "log1p"
self.init_dtype() self.init_dtype()
...@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation): ...@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
class TestSquare(TestActivation): class TestSquare(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "square" self.op_type = "square"
self.init_dtype() self.init_dtype()
...@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation): ...@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
class TestPow(TestActivation): class TestPow(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "pow" self.op_type = "pow"
self.init_dtype() self.init_dtype()
...@@ -1557,6 +1599,7 @@ class TestPow(TestActivation): ...@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
class TestPow_factor_tensor(TestActivation): class TestPow_factor_tensor(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "pow" self.op_type = "pow"
self.init_dtype() self.init_dtype()
...@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation): ...@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
class TestSTanh(TestActivation): class TestSTanh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "stanh" self.op_type = "stanh"
self.init_dtype() self.init_dtype()
...@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation): ...@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
class TestSTanhOpError(unittest.TestCase): class TestSTanhOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.stanh, 1) self.assertRaises(TypeError, fluid.layers.stanh, 1)
...@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20): ...@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
class TestSoftplus(TestActivation): class TestSoftplus(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softplus" self.op_type = "softplus"
self.init_dtype() self.init_dtype()
...@@ -1751,6 +1797,7 @@ def ref_softsign(x): ...@@ -1751,6 +1797,7 @@ def ref_softsign(x):
class TestSoftsign(TestActivation): class TestSoftsign(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softsign" self.op_type = "softsign"
self.init_dtype() self.init_dtype()
...@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase): ...@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
class TestThresholdedRelu(TestActivation): class TestThresholdedRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "thresholded_relu" self.op_type = "thresholded_relu"
self.init_dtype() self.init_dtype()
...@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation): ...@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
class TestThresholdedReluOpError(unittest.TestCase): class TestThresholdedReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1)
...@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase): ...@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
class TestHardSigmoid(TestActivation): class TestHardSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "hard_sigmoid" self.op_type = "hard_sigmoid"
self.init_dtype() self.init_dtype()
...@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation): ...@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
class TestHardSigmoidOpError(unittest.TestCase): class TestHardSigmoidOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1) self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1)
...@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase): ...@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
class TestSwish(TestActivation): class TestSwish(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "swish" self.op_type = "swish"
self.init_dtype() self.init_dtype()
...@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation): ...@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
class TestSwishOpError(unittest.TestCase): class TestSwishOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.swish, 1) self.assertRaises(TypeError, fluid.layers.swish, 1)
......
...@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase): ...@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
fetch_val2, = exe.run(compiled_prog, fetch_val2, = exe.run(compiled_prog,
feed=feed_dict, feed=feed_dict,
fetch_list=[fetch_var]) fetch_list=[fetch_var])
#NOTE(zhiqiu): Temporarily changed from array_equal to allclose.
self.assertTrue(np.array_equal(fetch_val1, fetch_val2)) # The root cause is that fuse_all_reduce and fuse_all_optimizer_ops may
# introduce small numerical diffs because of the instruction set on the virtual machine.
# The related unit tests, test_fuse_all_reduce_pass and test_fuse_optimizer_pass, also use "almostEqual" in their checks.
# Related issues:
# https://github.com/PaddlePaddle/Paddle/issues/21270
# https://github.com/PaddlePaddle/Paddle/issues/21046
# https://github.com/PaddlePaddle/Paddle/issues/21045
self.assertTrue(np.allclose(fetch_val1, fetch_val2))
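A small illustration of why allclose is the right check here: fused passes can reorder floating-point reductions, and different summation orders round differently, so bitwise equality is too strict. A hedged numpy sketch:

import numpy as np

a = np.random.rand(1000).astype(np.float32)
s1 = a.sum()                 # numpy's pairwise reduction order
s2 = np.float32(0.0)
for v in a:                  # a naive sequential order instead
    s2 += v
print(s1 == s2)              # often False: different rounding paths
print(np.allclose(s1, s2))   # True: the diff is within default tolerances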
def check_multi_card_fetch_var(self): def check_multi_card_fetch_var(self):
if self.is_invalid_test(): if self.is_invalid_test():
...@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase): ...@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
fetch_vals.append(fetch_val) fetch_vals.append(fetch_val)
for item in fetch_vals: for item in fetch_vals:
self.assertTrue(np.array_equal(fetch_vals[0], item)) # same reason as above
self.assertTrue(np.allclose(fetch_vals[0], item))
class CUDAInplaceTest(InplaceTestBase): class CUDAInplaceTest(InplaceTestBase):
......
...@@ -11,30 +11,23 @@ ...@@ -11,30 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Check whether ut is disabled. """
import os from __future__ import print_function
import sys import unittest
from test_dist_base import TestDistBase
def check_ut(): class TestDistMnist2x2FP16AllReduce(TestDistBase):
""" Get disabled unit tests. """ def _setup_config(self):
disable_ut_file = 'disable_ut' self._sync_mode = True
cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format( self._use_reduce = False
disable_ut_file) self._nccl2_mode = True
os.system(cmd)
with open(disable_ut_file) as utfile:
for u in utfile:
if u.rstrip('\r\n') == sys.argv[1]:
exit(0)
exit(1)
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist_fp16_allreduce.py", delta=1e-5)
if __name__ == '__main__':
if len(sys.argv) != 2: if __name__ == "__main__":
exit(1) unittest.main()
try:
check_ut()
except Exception as e:
print(e)
exit(1)
...@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
strategy.dgc = "True" strategy.dgc = "True"
self.assertEqual(strategy.dgc, False) self.assertEqual(strategy.dgc, False)
def test_fp16_allreduce(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.fp16_allreduce = True
self.assertEqual(strategy.fp16_allreduce, True)
strategy.fp16_allreduce = False
self.assertEqual(strategy.fp16_allreduce, False)
with self.assertRaises(TypeError):
strategy.fp16_allreduce = "True"
self.assertEqual(strategy.fp16_allreduce, False)
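The assertions above rely on the strategy property rejecting non-bool assignments while keeping the previous value. A hypothetical sketch of that setter pattern (class and field names illustrative, not DistributedStrategy's actual implementation):

class StrategySketch(object):
    def __init__(self):
        self._fp16_allreduce = False

    @property
    def fp16_allreduce(self):
        return self._fp16_allreduce

    @fp16_allreduce.setter
    def fp16_allreduce(self, flag):
        if not isinstance(flag, bool):
            # reject "True" and other non-bools; the old value survives
            raise TypeError("fp16_allreduce expects a bool value")
        self._fp16_allreduce = flag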
def test_sync_nccl_allreduce(self): def test_sync_nccl_allreduce(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.sync_nccl_allreduce = True strategy.sync_nccl_allreduce = True
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import unittest
import paddle
import paddle.fluid as fluid
import os
paddle.enable_static()
class TestFleetFP16CompressOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
def net(self, main_prog, startup_prog, dtype='float32'):
with fluid.program_guard(main_prog, startup_prog):
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype=dtype)
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.fp16_allreduce = True
return avg_cost, strategy
def test_fp16_allreduce_optimizer(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
cast_out = [
op.output('Out')[0] for op in avg_cost.block.ops
if op.type == 'cast'
]
cast_op_count = 0
for name in ops:
if name == 'cast':
cast_op_count += 1
self.assertIn('cast', ops)
self.assertEqual(cast_op_count, 12)  # 6 + 6: one cast_fp16 and one cast_fp32 for each of the 6 parameters
for name in cast_out:
self.assertIn('cast_fp16', name)
def test_fp16_allreduce_not_apply_fp16_net(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16')
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('cast', ops)
if __name__ == "__main__":
unittest.main()
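For reference, the 12 casts counted above come from the network's six parameters (weight and bias for each of the three fc layers): the fp16-allreduce strategy casts each gradient to fp16 before communication and back to fp32 before the update. A hedged numpy sketch of that round trip (a stand-in for the pass, not its implementation):

import numpy as np

def fp16_allreduce(worker_grads):
    # cast_fp16: shrink each worker's gradient before communication
    sent = [g.astype(np.float16) for g in worker_grads]
    summed = np.sum(sent, axis=0)        # stand-in for the allreduce sum
    # cast_fp32: restore full precision before the parameter update
    return summed.astype(np.float32)

grads = [np.random.rand(4).astype(np.float32) for _ in range(2)]
print(fp16_allreduce(grads))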
...@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase): ...@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
x = np.random.randn(*shape).astype("float32") x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x) y1 = compute_v1(x)
y2 = compute_v2(x) y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2)) result = np.allclose(y1, y2)
if not result:
print("y1:", y1, "\ty2:", y2)
self.assertTrue(result)
test_weight_bias_false() test_weight_bias_false()
def test_static(self): def test_static(self):
......
...@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear ...@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
if fluid.is_compiled_with_cuda():
fluid.set_flags({'FLAGS_cudnn_deterministic': True})
batch_size = 8 batch_size = 8
train_parameters = { train_parameters = {
"input_size": [3, 224, 224], "input_size": [3, 224, 224],
...@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
label.stop_gradient = True label.stop_gradient = True
out = se_resnext(img) out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label) softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(
input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss) avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy() dy_out = avg_loss.numpy()
...@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
name='pixel', shape=[3, 224, 224], dtype='float32') name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = se_resnext(img) out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label) softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss) avg_loss = fluid.layers.mean(x=loss)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
...@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
static_grad_value[static_grad_name_list[ static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i] i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out)) self.assertTrue(
np.allclose(static_out, dy_out),
"\nstatic_out: {}\ndy_out: {}".format(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
...@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
self.assertEqual(len(dy_grad_value), len(static_grad_value)) self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value): for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key])) self.assertTrue(
np.allclose(value, dy_grad_value[key]),
"\nstatic_grad_value: {}\ndy_grad_value: {}".format(
value, dy_grad_value[key]))
self.assertTrue(np.isfinite(value.all())) self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any())) self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_param_value), len(static_param_value)) self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value): for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key])) self.assertTrue(
np.allclose(value, dy_param_value[key]),
"\nstatic_param_value: {}\ndy_param_value: {}".format(
value, dy_param_value[key]))
self.assertTrue(np.isfinite(value.all())) self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any())) self.assertFalse(np.isnan(value.any()))
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest): ...@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
def init_kernel_type(self):
self.dtype = "float64" self.dtype = "float64"
def setUp(self): def setUp(self):
self.init_kernel_type()
self.config() self.config()
self.op_type = "matmul_v2" self.op_type = "matmul_v2"
x = np.random.random(self.x_shape).astype(self.dtype) x = np.random.random(self.x_shape).astype(self.dtype)
y = np.random.random(self.y_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype)
# map the uniform [0, 1) samples into [-0.1, 0.1]
x = -0.1 + 0.2 * x
y = -0.1 + 0.2 * y
result = reference_matmul(x, y, self.trans_x, self.trans_y) result = reference_matmul(x, y, self.trans_x, self.trans_y)
result = result.astype(self.dtype)
self.inputs = { self.inputs = {
'X': x, 'X': x,
'Y': y, 'Y': y,
...@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op): ...@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
self.y_shape = (1, 3, 2, 100) self.y_shape = (1, 3, 2, 100)
self.trans_x = False self.trans_x = False
self.trans_y = True self.trans_y = True
self.dtype = "float64"
class TestMatMuklOp3(TestMatMulV2Op): class TestMatMuklOp3(TestMatMulV2Op):
...@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op): ...@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp4(TestMatMulV2Op): class TestMatMuklOp4(TestMatMulV2Op):
...@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op): ...@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
self.y_shape = (1, 2, 100, 2) self.y_shape = (1, 2, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp5(TestMatMulV2Op): class TestMatMuklOp5(TestMatMulV2Op):
...@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op): ...@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (1, 1, 100, 2) self.x_shape = (1, 1, 100, 1)
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp6(TestMatMulV2Op): class TestMatMuklOp6(TestMatMulV2Op):
...@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op): ...@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp7(TestMatMulV2Op): class TestMatMuklOp7(TestMatMulV2Op):
...@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op): ...@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp8(TestMatMulV2Op): class TestMatMuklOp8(TestMatMulV2Op):
...@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op): ...@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp9(TestMatMulV2Op): class TestMatMuklOp9(TestMatMulV2Op):
...@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op): ...@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
self.y_shape = (2, 1, 2, 100) self.y_shape = (2, 1, 2, 100)
self.trans_x = False self.trans_x = False
self.trans_y = True self.trans_y = True
self.dtype = "float64"
class TestMatMuklOp10(TestMatMulV2Op): class TestMatMuklOp10(TestMatMulV2Op):
...@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op): ...@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (1, 1, 2, 100) self.x_shape = (1, 1, 25, 4)
self.y_shape = (1, 2, 100, 2) self.y_shape = (1, 2, 4, 25)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp11(TestMatMulV2Op): class TestMatMuklOp11(TestMatMulV2Op):
...@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op): ...@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp12(TestMatMulV2Op): class TestMatMuklOp12(TestMatMulV2Op):
...@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op): ...@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (2, 1, 100, 2) self.x_shape = (2, 1, 4, 25)
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 4, 25)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp13(TestMatMulV2Op): class TestMatMuklOp13(TestMatMulV2Op):
...@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op): ...@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (2, 2, 100, 2) self.x_shape = (2, 2, 2, 50)
self.y_shape = (2, 2, 100, 2) self.y_shape = (2, 2, 2, 50)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp14(TestMatMulV2Op): class TestMatMuklOp14(TestMatMulV2Op):
...@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op): ...@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
self.y_shape = (1, 2, 2, 100, 2) self.y_shape = (1, 2, 2, 100, 2)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp15(TestMatMulV2Op): class TestMatMuklOp15(TestMatMulV2Op):
...@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op): ...@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
self.y_shape = (1, 2, 2, 100, 1) self.y_shape = (1, 2, 2, 100, 1)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp16(TestMatMulV2Op): class TestMatMuklOp16(TestMatMulV2Op):
...@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op): ...@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
def config(self): def config(self):
self.x_shape = (100) self.x_shape = (100)
self.y_shape = (1, 2, 2, 100, 1) self.y_shape = (1, 2, 2, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp17(TestMatMulV2Op): class TestMatMuklOp17(TestMatMulV2Op):
...@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op): ...@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
self.y_shape = (100) self.y_shape = (100)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
#--------------------test matmul fp16--------------------
def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMatMulOpFp16Case(parent):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=atol)
def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X', 'Y'],
'Out',
max_relative_error=max_relative_error)
cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
TestMatMulOpFp16Case.__name__ = cls_name
globals()[cls_name] = TestMatMulOpFp16Case
create_test_fp16_class(TestMatMulV2Op)
create_test_fp16_class(TestMatMuklOp2)
create_test_fp16_class(TestMatMuklOp3)
create_test_fp16_class(TestMatMuklOp4)
create_test_fp16_class(TestMatMuklOp5)
create_test_fp16_class(TestMatMuklOp6)
create_test_fp16_class(TestMatMuklOp7)
create_test_fp16_class(TestMatMuklOp8)
create_test_fp16_class(TestMatMuklOp9)
create_test_fp16_class(TestMatMuklOp10)
create_test_fp16_class(TestMatMuklOp11)
create_test_fp16_class(TestMatMuklOp12)
create_test_fp16_class(TestMatMuklOp13)
create_test_fp16_class(TestMatMuklOp14)
create_test_fp16_class(TestMatMuklOp15)
create_test_fp16_class(TestMatMuklOp16)
create_test_fp16_class(TestMatMuklOp17)
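A note on the registration trick above: unittest only discovers test classes bound to module-level names, which is why each generated subclass is renamed and written into globals(). A minimal sketch of the same pattern:

import unittest

def create_case(parent, dtype, tag):
    class Case(parent):
        def init_kernel_type(self):
            self.dtype = dtype
    Case.__name__ = "{0}_{1}".format(parent.__name__, tag)
    globals()[Case.__name__] = Case  # module-level name => discoverable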
class TestMatMulV2API(unittest.TestCase): class TestMatMulV2API(unittest.TestCase):
...@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase): ...@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
y = paddle.to_tensor(input_y) y = paddle.to_tensor(input_y)
result = paddle.matmul(x, y) result = paddle.matmul(x, y)
def test_dygraph_fp16(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
with fluid.dygraph.guard(place):
input_x = np.random.random([4, 3]).astype("float16")
input_y = np.random.random([3, 4]).astype("float16")
x = paddle.to_tensor(input_x)
y = paddle.to_tensor(input_y)
result = paddle.matmul(x, y)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): ...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader( dataloader = DataLoader(
dataset, dataset,
places=places,
num_workers=num_workers, num_workers=num_workers,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
drop_last=True) drop_last=True)
......
...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): ...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader( dataloader = DataLoader(
dataset, dataset,
places=places,
num_workers=num_workers, num_workers=num_workers,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
drop_last=True) drop_last=True)
......
...@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): ...@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
self.shape = [2, 2, 3, 4, 5] self.shape = [2, 2, 3, 4, 5]
class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck):
@prog_scope()
def func(self, place):
prog = fluid.Program()
with fluid.program_guard(prog):
np.random.seed()
dtype = "float32"
eps = 0.005
atol = 2e-4
chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[
-1]
x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
z = fluid.layers.batch_norm(
input=x,
data_layout=self.data_layout,
use_global_stats=self.use_global_stats)
x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
w, b = prog.global_block().all_parameters()[1:3]
w_arr = np.ones(chn).astype(dtype)
b_arr = np.zeros(chn).astype(dtype)
gradient_checker.double_grad_check(
[x, w, b],
z,
x_init=[x_arr, w_arr, b_arr],
atol=atol,
place=place,
eps=eps)
class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
def init_test(self):
self.data_layout = 'NCHW'
self.use_global_stats = True
self.shape = [2, 3, 4, 5]
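Case 6 above turns on use_global_stats, under which batch_norm normalizes with the stored running statistics instead of batch statistics, making the op an elementwise affine map of x. A hedged numpy sketch of that inference-style formula, with the same all-ones scale and all-zeros bias the test uses:

import numpy as np

def bn_global_stats(x, mean, var, w, b, eps=1e-5):
    shp = (1, -1, 1, 1)  # NCHW: broadcast per-channel stats over N, H, W
    x_hat = (x - mean.reshape(shp)) / np.sqrt(var.reshape(shp) + eps)
    return x_hat * w.reshape(shp) + b.reshape(shp)

x = np.random.uniform(-1, 1, [2, 3, 4, 5]).astype("float32")
chn = x.shape[1]
y = bn_global_stats(x, np.zeros(chn, "float32"), np.ones(chn, "float32"),
                    np.ones(chn, "float32"), np.zeros(chn, "float32"))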
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -29,19 +29,23 @@ IMAGE_SIZE = 784 ...@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
CLASS_NUM = 10 CLASS_NUM = 10
# define a random dataset def random_batch_reader():
class RandomDataset(paddle.io.Dataset): def _get_random_inputs_and_labels():
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
np.random.seed(SEED) np.random.seed(SEED)
image = np.random.random([IMAGE_SIZE]).astype('float32') image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') label = np.random.randint(0, CLASS_NUM - 1, (
BATCH_SIZE,
1, )).astype('int64')
return image, label return image, label
def __len__(self): def __reader__():
return self.num_samples for _ in range(BATCH_NUM):
batch_image, batch_label = _get_random_inputs_and_labels()
batch_image = paddle.to_tensor(batch_image)
batch_label = paddle.to_tensor(batch_label)
yield batch_image, batch_label
return __reader__
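A hedged usage sketch of the reader factory above: random_batch_reader() returns the generator function itself, so a consumer calls it to obtain a fresh iterator per pass (loop body illustrative):

loader = random_batch_reader()
for image, label in loader():
    pass  # forward/backward on one random (BATCH_SIZE, IMAGE_SIZE) batch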
class LinearNet(nn.Layer): class LinearNet(nn.Layer):
...@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt): ...@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
class TestSaveLoad(unittest.TestCase): class TestSaveLoad(unittest.TestCase):
def setUp(self): def setUp(self):
# enable dygraph mode # enable dygraph mode
self.place = paddle.CPUPlace() paddle.disable_static()
paddle.disable_static(self.place)
# config seed # config seed
paddle.manual_seed(SEED) paddle.manual_seed(SEED)
...@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase): ...@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
# create data loader # create data loader
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) # TODO: the new DataLoader causes an unknown timeout on Windows, so replace it
loader = paddle.io.DataLoader( loader = random_batch_reader()
dataset,
places=self.place,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
# train # train
train(layer, loader, loss_fn, adam) train(layer, loader, loss_fn, adam)
...@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase): ...@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
layer, opt = self.build_and_train_model() layer, opt = self.build_and_train_model()
# save # save
layer_save_path = "linear.pdparams" layer_save_path = "test_paddle_save_load.linear.pdparams"
opt_save_path = "linear.pdopt" opt_save_path = "test_paddle_save_load.linear.pdopt"
layer_state_dict = layer.state_dict() layer_state_dict = layer.state_dict()
opt_state_dict = opt.state_dict() opt_state_dict = opt.state_dict()
...@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase): ...@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
# test save load in static mode # test save load in static mode
paddle.enable_static() paddle.enable_static()
static_save_path = "static_mode_test/linear.pdparams" static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams"
paddle.save(layer_state_dict, static_save_path) paddle.save(layer_state_dict, static_save_path)
load_static_state_dict = paddle.load(static_save_path) load_static_state_dict = paddle.load(static_save_path)
self.check_load_state_dict(layer_state_dict, load_static_state_dict) self.check_load_state_dict(layer_state_dict, load_static_state_dict)
...@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase): ...@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
# 2. test save path format error # 2. test save path format error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.save(layer_state_dict, "linear.model/") paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/")
# 3. test load path not exist error # 3. test load path not exist error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.load("linear.params") paddle.load("test_paddle_save_load.linear.params")
# 4. test load old save path error # 4. test load old save path error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.load("linear") paddle.load("test_paddle_save_load.linear")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
def __check_input(x, y): def __check_input(x, y):
var_names = {'x': x, 'y': y} var_names = {'x': x, 'y': y}
for name, val in var_names.items(): for name, val in var_names.items():
check_variable_and_dtype(val, name, ['float32', 'float64'], check_variable_and_dtype(
'matmul') val, name, ['float16', 'float32', 'float64'], 'matmul')
__check_input(x, y) __check_input(x, y)
......
...@@ -286,7 +286,7 @@ fi ...@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues # Get the list of PR authors with unresolved unit test issues
pip install PyGithub pip install PyGithub
# For getting PR related data # For getting PR related data
wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then if [ "${HASUTFIXED}" != "" ]; then
......
...@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ ...@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
make -j8 && make install && \ make -j8 && make install && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
# gcc4.8 TRT # Downgrade gcc && g++
RUN mkdir -p /opt/compiler && cd /opt/compiler && \ <install_gcc>
wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
...@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){ ...@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
function make_centos_dockerfile(){ function make_centos_dockerfile(){
dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6" dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name} sed 's/<baseimg>/10.2-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name}
dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \ sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \
...@@ -29,6 +29,15 @@ function make_centos_dockerfile(){ ...@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
rm -rf /usr/include/NvInfer*" ${dockerfile_name} rm -rf /usr/include/NvInfer*" ${dockerfile_name}
sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
sed -i 's#<install_gcc>#WORKDIR /usr/bin \
COPY tools/dockerfile/build_scripts /build_scripts \
RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name}
} }
......
...@@ -534,13 +534,6 @@ def get_incrementapi(): ...@@ -534,13 +534,6 @@ def get_incrementapi():
f.write('\n') f.write('\n')
# only white on CPU
gpu_not_white = [
"deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
"cuda_profiler", 'DGCMomentumOptimizer'
]
def get_wlist(): def get_wlist():
''' '''
this function will get the white list of API. this function will get the white list of API.
...@@ -552,17 +545,25 @@ def get_wlist(): ...@@ -552,17 +545,25 @@ def get_wlist():
''' '''
wlist = [] wlist = []
wlist_file = [] wlist_file = []
# only white on CPU
gpu_not_white = []
with open("wlist.json", 'r') as load_f: with open("wlist.json", 'r') as load_f:
load_dict = json.load(load_f) load_dict = json.load(load_f)
for key in load_dict: for key in load_dict:
if key == 'wlist_file': if key == 'wlist_dir':
wlist_file = wlist_file + load_dict[key] for item in load_dict[key]:
wlist_file.append(item["name"])
elif key == "gpu_not_white":
gpu_not_white = load_dict[key]
elif key == "wlist_api":
for item in load_dict[key]:
wlist.append(item["name"])
else: else:
wlist = wlist + load_dict[key] wlist = wlist + load_dict[key]
return wlist, wlist_file return wlist, wlist_file, gpu_not_white
wlist, wlist_file = get_wlist() wlist, wlist_file, gpu_not_white = get_wlist()
if len(sys.argv) < 2: if len(sys.argv) < 2:
print("Error: inadequate number of arguments") print("Error: inadequate number of arguments")
......
{ {
"wlist_file" : [ "wlist_dir":[
"../python/paddle/fluid/contrib", {
"../python/paddle/verison.py", "name":"../python/paddle/fluid/contrib",
"../python/paddle/fluid/core_avx.py", "annotation":""
"../python/paddle/distributed" },
{
"name":"../python/paddle/verison.py",
"annotation":""
},
{
"name":"../python/paddle/fluid/core_avx.py",
"annotation":""
},
{
"name":"../python/paddle/distributed",
"annotation":""
}
], ],
"wlist_inneed":[ "wlist_api":[
{
"name":"xxxxx",
"annotation":"not a real api, just for example"
}
],
"wlist_temp_api":[
"append_LARS", "append_LARS",
"BuildStrategy.debug_graphviz_path", "BuildStrategy.debug_graphviz_path",
"BuildStrategy.enable_sequential_execution", "BuildStrategy.enable_sequential_execution",
...@@ -63,9 +81,7 @@ ...@@ -63,9 +81,7 @@
"cuda_places", "cuda_places",
"CUDAPinnedPlace", "CUDAPinnedPlace",
"CUDAPlace", "CUDAPlace",
"Program.parse_from_string" "Program.parse_from_string",
],
"wlist_nosample":[
"Compressor", "Compressor",
"Compressor.config", "Compressor.config",
"Compressor.run", "Compressor.run",
...@@ -159,13 +175,9 @@ ...@@ -159,13 +175,9 @@
"RNN", "RNN",
"BiRNN", "BiRNN",
"RNNCellBase", "RNNCellBase",
"RNNCellBase.get_initial_states" "RNNCellBase.get_initial_states",
],
"wlist_no_op_pass":[
"gelu", "gelu",
"erf" "erf",
],
"wlist_ci_nopass":[
"DecodeHelper", "DecodeHelper",
"DecodeHelper.initialize", "DecodeHelper.initialize",
"DecodeHelper.sample", "DecodeHelper.sample",
...@@ -188,9 +200,7 @@ ...@@ -188,9 +200,7 @@
"SampleEmbeddingHelper", "SampleEmbeddingHelper",
"BasicDecoder", "BasicDecoder",
"lstm", "lstm",
"partial_sum" "partial_sum",
],
"wlist_nopass":[
"StateCell", "StateCell",
"StateCell.compute_state", "StateCell.compute_state",
"TrainingDecoder", "TrainingDecoder",
...@@ -242,9 +252,7 @@ ...@@ -242,9 +252,7 @@
"GroupNorm", "GroupNorm",
"SpectralNorm", "SpectralNorm",
"TreeConv", "TreeConv",
"prroi_pool" "prroi_pool",
],
"wlist_temp":[
"to_tensor", "to_tensor",
"ChunkEvaluator", "ChunkEvaluator",
"EditDistance", "EditDistance",
...@@ -322,9 +330,7 @@ ...@@ -322,9 +330,7 @@
"Conv2DTranspose", "Conv2DTranspose",
"QueueDataset.local_shuffle", "QueueDataset.local_shuffle",
"save_persistables@dygraph/checkpoint.py", "save_persistables@dygraph/checkpoint.py",
"load_persistables@dygraph/checkpoint.py" "load_persistables@dygraph/checkpoint.py",
],
"wlist_ignore":[
"elementwise_pow", "elementwise_pow",
"WeightedAverage.reset", "WeightedAverage.reset",
"ChunkEvaluator.eval", "ChunkEvaluator.eval",
...@@ -401,5 +407,13 @@ ...@@ -401,5 +407,13 @@
"LinearChainCRF.forward", "LinearChainCRF.forward",
"CRFDecoding.forward", "CRFDecoding.forward",
"SequenceTagging.forward" "SequenceTagging.forward"
],
"gpu_not_white":[
"deformable_conv",
"cuda_places",
"CUDAPinnedPlace",
"CUDAPlace",
"cuda_profiler",
"DGCMomentumOptimizer"
] ]
} }