Commit 96aa0973 authored by S sandyhouse

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_timeline

 
<p align="center">
<img align="center" src="doc/imgs/logo.png", width=1600>
<p>
......
...@@ -60,9 +60,8 @@ else()
  set(CUDNN_FOUND OFF)
endif()

-if(CUDNN_FOUND)
-  file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+macro(find_cudnn_version cudnn_header_file)
+  file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS)
  get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
  string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
...@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
    math(EXPR CUDNN_VERSION
         "${CUDNN_MAJOR_VERSION} * 1000 +
          ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
+    message(STATUS "Current cuDNN header is ${cudnn_header_file} "
+                   "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
  endif()
-  message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. "
-                 "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
-endif()
endmacro()
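# cuDNN 8 moved the CUDNN_MAJOR/MINOR/PATCHLEVEL macros from cudnn.h into
# cudnn_version.h, so probe cudnn.h first and fall back to cudnn_version.h below.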
if(CUDNN_FOUND)
find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h)
if (NOT CUDNN_MAJOR_VERSION)
find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h)
endif()
endif()
...@@ -43,6 +43,12 @@ message GradientMergeConfig {
  optional bool avg = 2 [ default = true ];
}
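// Deep Gradient Compression (DGC): sparsification warms up starting at
// rampup_begin_step over rampup_step steps, following the listed sparsity ratios.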
message DGCConfig {
optional int32 rampup_begin_step = 1 [ default = 0 ];
optional int32 rampup_step = 2 [ default = 1 ];
repeated float sparsity = 3;
}
message LarsConfig {
  optional float lars_coeff = 1 [ default = 0.001 ];
  optional float lars_weight_decay = 2 [ default = 0.0005 ];
...@@ -114,6 +120,7 @@ message DistributedStrategy {
  optional AMPConfig amp_configs = 102;
  optional LocalSGDConfig localsgd_configs = 103;
  optional GradientMergeConfig gradient_merge_configs = 104;
optional DGCConfig dgc_configs = 105;
  optional PipelineConfig pipeline_configs = 106;
  optional AsyncConfig a_sync_configs = 107;
  optional LarsConfig lars_configs = 108;
......
...@@ -885,12 +885,14 @@ void PartialGradTask::RunEachOp(OpBase *op) {
    if (create_graph_) {
      auto double_grad_node = CreateGradOpNode(op->InnerOp(), tmp_ins, tmp_outs,
                                               op->Attrs(), op->place());
-      if (double_grad_node) {
-        VLOG(10) << "Create " << double_grad_node->size()
-                 << " double grad op(s) for " << op->Type()
-                 << ", pending ops: " << GradPendingOpTypes(*double_grad_node);
-        double_grad_nodes_.emplace_back(std::move(double_grad_node));
-      }
+      PADDLE_ENFORCE_NOT_NULL(
+          double_grad_node,
+          platform::errors::NotFound("The Op %s doesn't have any grad op.",
+                                     op->Type()));
+      VLOG(10) << "Create " << double_grad_node->size()
+               << " double grad op(s) for " << op->Type()
+               << ", pending ops: " << GradPendingOpTypes(*double_grad_node);
+      double_grad_nodes_.emplace_back(std::move(double_grad_node));
    }

  VLOG(10) << "There are " << grads_to_accumulate_.size() << " to sum gradient";
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
    nvinfer1::ILayer* layer = nullptr;
    if (engine_->with_dynamic_shape()) {
-      auto use_fp16 = engine_->WithFp16();
      plugin::DynamicPluginTensorRT* plugin = nullptr;
-      if (use_fp16) {
-#ifdef SUPPORTS_CUDA_FP16
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<half>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#else
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-#endif
-      } else {
-        plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-            input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps);
-      }
+      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+          input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
+          eps);
      layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
    } else {
      PADDLE_THROW(platform::errors::Fatal(
......
...@@ -200,9 +200,23 @@ class TensorRTEngine {
  void Deserialize(const std::string& engine_serialized_data) {
    freshDeviceId();
    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
    if (with_dynamic_shape_) {
#if IS_TRT_VERSION_GE(6000)
      infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size(),
nullptr));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"To enable dynamic shape support, the TensorRT version should be "
"greater than 6.0.0"));
#endif
} else {
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
}
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "build cuda engine failed when deserialize engine info.!");
  }
......
...@@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
  return static_cast<nvinfer1::IRuntime*>(
      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
}
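// Wrapper over the dynamically loaded TensorRT getPluginRegistry symbol; the
// REGISTER_TRT_PLUGIN_V2 registrars use it to register plugin creators with the
// global registry, so deserialized engines can recreate the plugins by name.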
static nvinfer1::IPluginRegistry* getPluginRegistry() {
return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
}
// A logger for creating a TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
......
...@@ -33,53 +33,29 @@ namespace plugin {
template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
-  int nb_emb = embs_.size();
-  std::vector<void *> ptr_vector(nb_emb);
-  std::vector<std::vector<half>> emb_fp16(nb_emb);
-  if (sizeof(T) == sizeof(float)) {
-    // FP32
-    for (int i = 0; i < nb_emb; ++i) {
-      ptr_vector[i] = embs_[i];
-    }
-  } else {
-    // FP16
-    for (int i = 0; i < nb_emb; ++i) {
-      auto emb_size = emb_sizes_[i];
-      auto &tmp = emb_fp16[i];
-      tmp.resize(emb_size);
-      for (int j = 0; j < emb_size; ++j) {
-        tmp[j] = static_cast<half>(embs_[i][j]);
-      }
-      ptr_vector[i] = tmp.data();
-    }
-  }
  embs_gpu_.resize(embs_.size());
  for (int i = 0; i < embs_.size(); i++) {
-    cudaMalloc(&embs_gpu_[i], sizeof(T) * emb_sizes_[i]);
-    cudaMemcpy(embs_gpu_[i], ptr_vector[i], emb_sizes_[i] * sizeof(T),
-               cudaMemcpyHostToDevice);
+    if (embs_[i]) {
+      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
+      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+                 cudaMemcpyHostToDevice);
+    }
  }

-  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
-  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
-             cudaMemcpyHostToDevice);
+  if (bias_) {
+    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
+    cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }
+  if (scale_) {
+    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
+    cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
+               cudaMemcpyHostToDevice);
+  }

  return 0;
}

-template <typename T>
-size_t EmbEltwiseLayernormPluginDynamic<T>::getSerializationSize() const {
-  return 0;
-}
-
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::serialize(void *buffer) const {}

template <typename T>
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
......
...@@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
        hidden_size_(hidden_size),
        eps_(eps) {}

-  EmbEltwiseLayernormPluginDynamic(void const* serialData,
-                                   size_t serialLength) {}
+  EmbEltwiseLayernormPluginDynamic(void const* serial_data,
+                                   size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_gpu_.resize(emb_sizes_.size());
embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
cudaMemcpyHostToDevice);
reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float);
embs_[i] = nullptr;
}
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
bias_ = nullptr;
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
scale_ = nullptr;
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new EmbEltwiseLayernormPluginDynamic(
        embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
...@@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override;

-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    int sum_num = 0;
sum_num += SerializedSize(emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
sum_num += emb_sizes_[i] * sizeof(float);
}
sum_num += SerializedSize(bias_size_);
sum_num += SerializedSize(scale_size_);
sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_);
// sum_num += SerializedSize(with_fp16_);
return sum_num;
}
void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
}
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_);
}
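  // The weight buffers referenced here are the GPU copies made in initialize():
  // SerializeCudaPointer copies them device-to-host into the engine blob, and
  // the deserializing constructor above uploads them straight back to the GPU,
  // leaving the host-side pointers null.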
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;

  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;

  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}

  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }

  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
...@@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  // data on devices
  float* bias_gpu_;
  float* scale_gpu_;
-  std::vector<T*> embs_gpu_;
+  std::vector<float*> embs_gpu_;

  std::vector<int> emb_sizes_;
  int bias_size_;
...@@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  int hidden_size_;
  float eps_;
};
class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
public:
EmbEltwiseLayernormPluginV2Creator() {}
const char* getPluginName() const override {
return "fused_embedding_eltwise_layernorm_plugin";
}
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
serial_length);
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator);
#endif
}  // namespace plugin
}  // namespace tensorrt
......
...@@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)

-size_t GeluPluginDynamic::getSerializationSize() const { return 0; }
-
-void GeluPluginDynamic::serialize(void* buffer) const {}

nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
......
...@@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT {
  // It was used for tensorrt deserialization.
  // It should not be called by users.
  GeluPlugin(void const* serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
  }

  ~GeluPlugin() {}
...@@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT {
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nb_input_dims) override;
  int enqueue(int batch_size, const void* const* inputs, void** outputs,
              void* workspace, cudaStream_t stream) override;

 protected:
...@@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT {
class GeluPluginDynamic : public DynamicPluginTensorRT {
 public:
  GeluPluginDynamic() {}
  GeluPluginDynamic(void const* serial_data, size_t serial_length) {}
  ~GeluPluginDynamic() {}

  nvinfer1::IPluginV2DynamicExt* clone() const override {
...@@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override { return 0; }

  size_t getSerializationSize() const override { return 0; }
  void serialize(void* buffer) const override {}

  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
};
class GeluPluginV2Creator : public nvinfer1::IPluginCreator {
public:
GeluPluginV2Creator() {}
const char* getPluginName() const override { return "gelu_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new GeluPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator);
#endif
}  // namespace plugin
......
...@@ -152,10 +152,6 @@ inline void TransposeQKV(const int batch, const int seq_len,

int QkvToContextPluginDynamic::initialize() { return 0; }

-size_t QkvToContextPluginDynamic::getSerializationSize() const { return 0; }
-
-void QkvToContextPluginDynamic::serialize(void *buffer) const {}

nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
...@@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
        scale_(scale),
        ban_fp16_(ban_fp16) {}

-  QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {}
+  QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &hidden_);
DeserializeValue(&serial_data, &serial_length, &head_number_);
DeserializeValue(&serial_data, &serial_length, &head_size_);
DeserializeValue(&serial_data, &serial_length, &scale_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
                                         scale_, ban_fp16_);
...@@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override;

-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    return SerializedSize(hidden_) + SerializedSize(head_number_) +
SerializedSize(head_size_) + SerializedSize(scale_) +
SerializedSize(ban_fp16_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, hidden_);
SerializeValue(&buffer, head_number_);
SerializeValue(&buffer, head_size_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, ban_fp16_);
}
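  // Persisting these scalars lets deserializePlugin() below rebuild the plugin
  // from a serialized engine without access to the original network definition.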
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
...@@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
...@@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
  float scale_;
  bool ban_fp16_;
};
class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator {
public:
QkvToContextPluginV2Creator() {}
const char* getPluginName() const override { return "qkv_to_context_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator);
#endif
}  // namespace plugin
......
...@@ -32,18 +32,14 @@ namespace plugin {

int SkipLayerNormPluginDynamic::initialize() {
  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-  cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
+  cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-  cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
+  cudaMemcpy(scale_gpu_, scale_.data(), scale_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  return 0;
}

-size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; }
-
-void SkipLayerNormPluginDynamic::serialize(void *buffer) const {}

nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
......
...@@ -29,61 +29,84 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
 public:
-  explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size,
-                                      int scale_size, const float eps,
-                                      bool ban_fp16)
-      : bias_(bias),
-        scale_(scale),
-        bias_size_(bias_size),
+  explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale,
+                                      int bias_size, int scale_size,
+                                      const float eps, bool ban_fp16)
+      : bias_size_(bias_size),
        scale_size_(scale_size),
        eps_(eps),
-        ban_fp16_(ban_fp16) {}
-  SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {}
+        ban_fp16_(ban_fp16) {
+    bias_.resize(bias_size);
+    scale_.resize(scale_size);
+    std::copy(bias, bias + bias_size, bias_.data());
+    std::copy(scale, scale + scale_size, scale_.data());
+  }
SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &bias_);
DeserializeValue(&serial_data, &serial_length, &scale_);
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
}
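  // Keeping bias_/scale_ as std::vector (rather than raw caller-owned pointers)
  // lets clone() and serialize() work after the original weights go out of scope.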
  nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_,
-                                          scale_size_, eps_, ban_fp16_);
+    return new SkipLayerNormPluginDynamic(
+        bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
  }

  const char* getPluginType() const override { return "skip_layernorm_plugin"; }
  int getNbOutputs() const override { return 1; }
  int initialize() override;

-  size_t getSerializationSize() const override;
-  void serialize(void* buffer) const override;
+  size_t getSerializationSize() const override {
+    size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(bias_size_) + SerializedSize(scale_size_) +
                      SerializedSize(eps_) + SerializedSize(ban_fp16_);
return ser_size;
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, bias_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, ban_fp16_);
}
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
 private:
-  float* bias_;
-  float* scale_;
+  std::vector<float> bias_;
+  std::vector<float> scale_;

  float* bias_gpu_;
  float* scale_gpu_;
...@@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
  float eps_;
  bool ban_fp16_;
};
class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SkipLayerNormPluginV2Creator() {}
const char* getPluginName() const override { return "skip_layernorm_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator);
#endif
}  // namespace plugin
......
...@@ -175,11 +175,24 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
  void serializeBase(void*& buffer) const;  // NOLINT

 private:
-  std::string name_space_{"paddle_trt"};
-  std::string plugin_base_{"plugin_dynamic"};
+  std::string name_space_;
+  std::string plugin_base_;
};
#endif
template <typename T>
class TrtPluginRegistrarV2 {
public:
TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
private:
T creator;
};
#define REGISTER_TRT_PLUGIN_V2(name) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
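// A file-scope TrtPluginRegistrarV2<SomeCreator> object is constructed when the
// library loads; its constructor registers the creator with TensorRT's global
// plugin registry, so engine deserialization can find plugins by the name and
// version the creator reports.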
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
...@@ -128,6 +128,12 @@ inline void DeserializeValue(void const** buffer, size_t* buffer_size,
  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
}
template <typename T>
inline void SerializeCudaPointer(void** buffer, T* value, int size) {
cudaMemcpy((*buffer), value, size * sizeof(T), cudaMemcpyDeviceToHost);
reinterpret_cast<char*&>(*buffer) += size * sizeof(T);
}
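// Unlike SerializeValue, the source pointer here is device memory: the helper
// cudaMemcpy's `size` elements into the host-side serialization buffer and
// advances the write cursor, so getSerializationSize() must account for
// size * sizeof(T) bytes.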
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
...@@ -433,6 +433,25 @@ if(WITH_GPU AND TENSORRT_FOUND)
  inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
          EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
          ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
endif()

set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle {
namespace inference {
void run(const AnalysisConfig& config, std::vector<float>* out_data) {
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();
int run_batch = 1;
const int run_seq_len = 128;
std::vector<int64_t> tmp_input;
std::vector<float> tmp_four_input;
tmp_input.reserve(run_batch * run_seq_len);
tmp_four_input.reserve(run_batch * run_seq_len);
int64_t i0[run_seq_len] = {
1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321,
4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2,
75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2};
int64_t i1[run_seq_len] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
// first input
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({run_batch, run_seq_len, 1});
input_t->copy_from_cpu(i0);
// second input
auto input_t2 = predictor->GetInputTensor(input_names[1]);
input_t2->Reshape({run_batch, run_seq_len, 1});
input_t2->copy_from_cpu(i1);
// third input.
auto input_t3 = predictor->GetInputTensor(input_names[2]);
input_t3->Reshape({run_batch, run_seq_len, 1});
input_t3->copy_from_cpu(i2);
auto input_t4 = predictor->GetInputTensor(input_names[3]);
input_t4->Reshape({run_batch, run_seq_len, 1});
input_t4->copy_from_cpu(i3);
ASSERT_TRUE(predictor->ZeroCopyRun());
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputTensor(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data->resize(out_num);
output_t->copy_to_cpu(out_data->data());
}
void trt_ernie(bool with_fp16, std::vector<float> result) {
AnalysisConfig config;
std::string model_dir = FLAGS_infer_model;
SetConfig(&config, model_dir, true /* use_gpu */);
config.SwitchUseFeedFetchOps(false);
int head_number = 12;
int batch = 1;
int min_seq_len = 1;
int max_seq_len = 128;
int opt_seq_len = 128;
std::vector<int> min_shape = {batch, min_seq_len, 1};
std::vector<int> max_shape = {batch, max_seq_len, 1};
std::vector<int> opt_shape = {batch, opt_seq_len, 1};
// Set the input's min, max, opt shape
std::map<std::string, std::vector<int>> min_input_shape = {
{"read_file_0.tmp_0", min_shape},
{"read_file_0.tmp_1", min_shape},
{"read_file_0.tmp_2", min_shape},
{"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"read_file_0.tmp_0", max_shape},
{"read_file_0.tmp_1", max_shape},
{"read_file_0.tmp_2", max_shape},
{"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"read_file_0.tmp_0", opt_shape},
{"read_file_0.tmp_1", opt_shape},
{"read_file_0.tmp_2", opt_shape},
{"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
auto precision = AnalysisConfig::Precision::kFloat32;
if (with_fp16) {
precision = AnalysisConfig::Precision::kHalf;
}
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
std::vector<float> out_data;
run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
}
}
TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result);
}
TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16
std::vector<float> result = {0.598336, 0.219558, 0.182106};
trt_ernie(true, result);
#endif
}
} // namespace inference
} // namespace paddle
...@@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
  if (with_fp16) {
    precision = AnalysisConfig::Precision::kHalf;
  }
-  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, true);
+  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false);
  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                opt_input_shape);
  std::vector<float> out_data;
......
...@@ -25,17 +25,23 @@ namespace operators {

using framework::Tensor;
using platform::Transform;
#ifdef __NVCC__
template <typename T, typename UnaryOperation>
__global__ void ClipCudaKernel(const T* input, T* out, int num,
UnaryOperation op) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < num) {
out[idx] = op(input[idx]);
}
}
#endif
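// One thread per element; ClipKernel below launches this with 256-thread blocks
// when the op runs on a CUDA place.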
template <typename T>
class ClipFunctor {
 public:
  explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
  HOSTDEVICE T operator()(const T& x) const {
-    if (x < min_)
-      return min_;
-    else if (x > max_)
-      return max_;
-    else
-      return x;
+    return x < min_ ? min_ : x > max_ ? max_ : x;
  }

 private:
...@@ -97,9 +103,20 @@ class ClipKernel : public framework::OpKernel<T> {
      T* out_data = out->mutable_data<T>(context.GetPlace());
      const T* x_data = x->data<T>();
      int64_t numel = x->numel();
      if (platform::is_gpu_place(context.GetPlace())) {
#ifdef __NVCC__
        int threads = 256;
int blocks = (numel + threads - 1) / threads;
ClipCudaKernel<T, ClipFunctor<T>><<<
blocks, threads, 0,
context.template device_context<platform::CUDADeviceContext>()
.stream()>>>(x_data, out_data, numel, ClipFunctor<T>(min, max));
#endif
} else {
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x_data,
x_data + numel, out_data, ClipFunctor<T>(min, max));
}
    } else if (x_var->IsType<framework::SelectedRows>()) {
      auto* x = context.Input<framework::SelectedRows>("X");
      auto* out = context.Output<framework::SelectedRows>("Out");
......
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/controlflow/compare_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"

namespace paddle {
namespace operators {
...@@ -85,14 +88,22 @@ class CompareOp : public framework::OperatorWithKernel {
    auto dim_x = context->GetInputDim("X");
    auto dim_y = context->GetInputDim("Y");

-    PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(),
-                      platform::errors::InvalidArgument(
-                          "The size of dim_y should not be greater than "
-                          "dim_x's, but received dim_y: %d > dim_x: %d.\n",
-                          dim_y.size(), dim_x.size()));
-
-    context->SetOutputDim("Out", context->GetInputDim("X"));
-    context->ShareLoD("X", "Out");
+    if (context->GetInputDim("X") == context->GetInputDim("Y")) {
+      context->ShareDim("X", /*->*/ "Out");
+      context->ShareLoD("X", /*->*/ "Out");
+    } else {
+      int max_dim = std::max(dim_x.size(), dim_y.size());
+      int axis = std::abs(dim_x.size() - dim_y.size());
+      std::vector<int> x_dims_array(max_dim);
+      std::vector<int> y_dims_array(max_dim);
+      std::vector<int> out_dims_array(max_dim);
+      GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(),
+                             y_dims_array.data(), out_dims_array.data(),
+                             max_dim, axis);
+      context->SetOutputDim("Out", framework::make_ddim(out_dims_array));
+      // to do
+      context->ShareLoD("X", /*->*/ "Out");
+    }
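    // When X and Y have different shapes, comparison ops now follow the same
    // broadcasting rule as the elementwise ops: trailing dimensions are aligned
    // (axis = rank difference) and the broadcast shape becomes the output shape.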
  }

  framework::OpKernelType GetExpectedKernelType(
......
...@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
        workspace_size = GetWorkspaceSize(args, algo);
        if (workspace_size > workspace_size_limit) {
-          has_got_workspace_size = false;
-          VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                     "the workspace size request("
-                  << workspace_size << ") exceeds the limit("
-                  << workspace_size_limit << ")";
-        }
-        if (!has_got_workspace_size) {
-          PADDLE_ENFORCE_CUDA_SUCCESS(
-              platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-                  args.handle, args.idesc.desc(), args.wdesc.desc(),
-                  args.cdesc.desc(), args.odesc.desc(),
-                  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-                  workspace_size_limit, &algo));
+          workspace_size_limit = workspace_size;
        }
#else
        PADDLE_ENFORCE_CUDA_SUCCESS(
...@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
#endif
        workspace_size = GetWorkspaceSize(args, algo);
        if (workspace_size > workspace_size_limit) {
+          workspace_size_limit = workspace_size;
          has_got_workspace_size = false;
-          VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                     "the workspace size request("
-                  << workspace_size << ") exceeds the limit("
-                  << workspace_size_limit << ")";
-        }
-        if (!has_got_workspace_size) {
-          PADDLE_ENFORCE_CUDA_SUCCESS(
-              platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-                  args.handle, args.wdesc.desc(), args.odesc.desc(),
-                  args.cdesc.desc(), args.idesc.desc(),
-                  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-                  workspace_size_limit, &algo));
        }
#else
        PADDLE_ENFORCE_CUDA_SUCCESS(
...@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
        algo = (perf_results.get())[best_algo_idx].algo;
        workspace_size = GetWorkspaceSize(args, algo);
        if (workspace_size > workspace_size_limit) {
-          has_got_workspace_size = false;
-          VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
-                     "the workspace size request("
-                  << workspace_size << ") exceeds the limit("
-                  << workspace_size_limit << ")";
-        }
-        if (!has_got_workspace_size) {
-          PADDLE_ENFORCE_CUDA_SUCCESS(
-              platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-                  args.handle, args.idesc.desc(), args.odesc.desc(),
-                  args.cdesc.desc(), args.wdesc.desc(),
-                  CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-                  workspace_size_limit, &algo));
+          workspace_size = workspace_size_limit;
        }
#else
        PADDLE_ENFORCE_CUDA_SUCCESS(
......
...@@ -197,6 +197,40 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
}

#ifdef __NVCC__
template <typename Functor, typename T, typename OutType>
__global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre,
int n, int post, int total, Functor func) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int idx = tid / post % n;
if (tid < total) {
out[tid] = func(x[tid], y[idx]);
}
}
template <typename Functor, typename T, typename OutType>
void ComputeElementwiseCUDA(const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z,
int pre, int n, int post,
const platform::CUDADeviceContext &ctx,
Functor func, const bool is_xsize_larger = true) {
const T *x_data = x->data<T>();
const T *y_data = y->data<T>();
OutType *out_data = z->mutable_data<OutType>(ctx.GetPlace());
int numel = pre * n * post;
int threads = 256;
int blocks = (numel + threads - 1) / threads;
if (is_xsize_larger) {
ElementwiseKernel<Functor, T,
OutType><<<blocks, threads, 0, ctx.stream()>>>(
x_data, y_data, out_data, pre, n, post, numel, func);
} else {
ElementwiseKernel<Functor, T,
OutType><<<blocks, threads, 0, ctx.stream()>>>(
y_data, x_data, out_data, pre, n, post, numel, func);
}
}
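// Note: ElementwiseKernel above treats the larger operand as a flat (pre, n,
// post) layout and broadcasts the smaller one along n, so each thread recovers
// the broadcast index from its flat offset as (tid / post) % n.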
template <typename Functor, typename T, typename OutType = T>
__global__ void CommonForwardBroadcastCUDAKernel(
    const int *x_strides_array, const int *y_strides_array,
...@@ -1908,6 +1942,16 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
        ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
    return;
  }
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__
ComputeElementwiseCUDA<Functor, T, OutType>(
x, y, z, pre, n, post,
ctx.template device_context<platform::CUDADeviceContext>(), func,
is_xsize_larger);
#endif
return;
}
  if (post == 1) {
    functor.RunRowWise(n, pre);
    return;
......
...@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(transformed_input.dims()); auto x_dims = framework::vectorize(transformed_input.dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (!exhaustive_search) { if (!exhaustive_search) {
int perf_count;
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
workspace_size_limit, &algo)); perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo;
VLOG(3) << "cuDNN forward algo " << algo; VLOG(3) << "cuDNN forward algo " << algo;
} else { } else {
std::function<cudnnConvolutionFwdAlgo_t()> search_func = std::function<cudnnConvolutionFwdAlgo_t()> search_func =
......
...@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> { ...@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
out_strides[i].data())); out_strides[i].data()));
int perf_count;
int best_algo_idx = 0;
size_t tmp_size = 0;
std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]);
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get()));
workspace_size_limit, &algo[i])); algo[i] = (perf_results.get())[best_algo_idx].algo;
size_t tmp_size = 0;
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
algo[i], &tmp_size)); algo[i], &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
} }
cudnnActivationDescriptor_t cudnn_act_desc = cudnnActivationDescriptor_t cudnn_act_desc =
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <algorithm>
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
......
...@@ -99,39 +99,25 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad, ...@@ -99,39 +99,25 @@ __global__ void PnormGradient(const T* x, const T* x_norm, const T* y_grad,
const float porder, const int pre, const float porder, const int pre,
const int axis_n, const int post, const T eps, const int axis_n, const int post, const T eps,
T* x_grad) { T* x_grad) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage_sum;
// dx = (x/pnorm_broadcast).pow(p-1) * norm_dy.broadcast * sign(x) // dx = (x/pnorm_broadcast).pow(p-1) * norm_dy.broadcast * sign(x)
int num = pre * post; int num = pre * post;
auto porder_grad = static_cast<T>(porder - 1.0f); auto porder_grad = static_cast<T>(porder - 1.0f);
for (int i = blockIdx.x; i < num; i += gridDim.x) { for (int i = blockIdx.x; i < num; i += gridDim.x) {
T sum = 0.0; __shared__ T pnorm_i;
__shared__ T row_sum; __shared__ T yout_i;
__shared__ T row_sqrt_norm;
__shared__ T row_norm;
auto base = (i / post) * post * axis_n + (i % post); auto base = (i / post) * post * axis_n + (i % post);
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
int index = base + j * post;
sum += x[index] * y_grad[index];
}
T reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
row_sum = reduce_result; pnorm_i = x_norm[i];
row_sqrt_norm = x_norm[i]; yout_i = y_grad[i];
row_norm = row_sqrt_norm * row_sqrt_norm;
} }
__syncthreads();
const T pnorm_i = x_norm[i]; __syncthreads();
const T yout_i = y_grad[i];
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
int index = base + j * post; int index = base + j * post;
const T x_ij = inline_abs(x[index]); const T x_ij = inline_abs(x[index]);
const T dy_ij = y_grad[index];
x_grad[index] = inline_pow(x_ij, porder_grad) / x_grad[index] = inline_pow(x_ij, porder_grad) /
(inline_pow(pnorm_i, porder_grad) + eps) * yout_i * (inline_pow(pnorm_i, porder_grad) + eps) * yout_i *
inline_sign(x[index]); inline_sign(x[index]);
......
...@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnSetTensorNdDescriptor); \ __macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \ __macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \ __macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \ __macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \ __macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \ __macro(cudnnCreateFilterDescriptor); \
...@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \ __macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \ __macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \ __macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \ __macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \ __macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \ __macro(cudnnGetRNNTrainingReserveSize); \
...@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 3000 #if CUDNN_VERSION >= 3000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize); __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
// APIs available after R3:
#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
// APIs available after R4: // APIs available after R4:
#if CUDNN_VERSION >= 4007 #if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
...@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8);
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,18 +13,62 @@ ...@@ -13,18 +13,62 @@
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/dynload/tensorrt.h"
#include <string>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
std::once_flag tensorrt_dso_flag; std::once_flag tensorrt_dso_flag;
void *tensorrt_dso_handle; void* tensorrt_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name #define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
void* GetTensorRtHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer.dll";
#else
std::string dso_name = "libnvinfer.so";
#endif
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
void* dso_handle = dlopen(dso_name.c_str(), dynload_flags);
if (nullptr == dso_handle) {
auto error_msg =
"TensorRT dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if TensorRT "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure TensorRT dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
std::cerr << string::Sprintf(error_msg, dso_name, errorno);
}
return dso_handle;
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -27,6 +27,8 @@ namespace paddle { ...@@ -27,6 +27,8 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
void* GetTensorRtHandle();
extern std::once_flag tensorrt_dso_flag; extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle; extern void* tensorrt_dso_handle;
...@@ -36,8 +38,7 @@ extern void* tensorrt_dso_handle; ...@@ -36,8 +38,7 @@ extern void* tensorrt_dso_handle;
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(&::__name); \ using tensorrt_func = decltype(&::__name); \
std::call_once(tensorrt_dso_flag, []() { \ std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \ tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
paddle::platform::dynload::GetTensorRtDsoHandle(); \
PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \ PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \
platform::errors::Unavailable( \ platform::errors::Unavailable( \
"Load tensorrt %s failed", #__name)); \ "Load tensorrt %s failed", #__name)); \
...@@ -53,7 +54,8 @@ extern void* tensorrt_dso_handle; ...@@ -53,7 +54,8 @@ extern void* tensorrt_dso_handle;
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \ __macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); __macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
......
...@@ -958,6 +958,7 @@ function parallel_test() { ...@@ -958,6 +958,7 @@ function parallel_test() {
ut_total_startTime_s=`date +%s` ut_total_startTime_s=`date +%s`
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
pip install ${PADDLE_ROOT}/build/python/dist/*whl
if [ "$WITH_GPU" == "ON" ];then if [ "$WITH_GPU" == "ON" ];then
parallel_test_base_gpu parallel_test_base_gpu
else else
......
...@@ -604,6 +604,15 @@ class DistributedStrategy(object): ...@@ -604,6 +604,15 @@ class DistributedStrategy(object):
else: else:
print("WARNING: lars should have value of bool type") print("WARNING: lars should have value of bool type")
@property
def lars_configs(self):
return get_msg_dict(self.strategy.lars_configs)
@lars_configs.setter
def lars_configs(self, configs):
check_configs_key(self.strategy.lars_configs, configs, "lars_configs")
assign_configs_value(self.strategy.lars_configs, configs)
@property @property
def lamb(self): def lamb(self):
return self.strategy.lamb return self.strategy.lamb
......
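A minimal sketch of how the new lars_configs property above might be used. The module path and key names are assumptions (the keys have to match the LarsConfig fields of the strategy proto); the values are illustrative only.

import paddle.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.lars = True
# assumed key names; they must match the LarsConfig message fields
strategy.lars_configs = {
    "lars_coeff": 0.001,
    "lars_weight_decay": 0.0005,
}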
...@@ -279,8 +279,11 @@ class Fleet(object): ...@@ -279,8 +279,11 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/Fleet # for more examples, please reference https://github.com/PaddlePaddle/Fleet
""" """
context = {}
# cache original feed forward program # cache original feed forward program
self.origin_main_program = loss.block.program self.origin_main_program = loss.block.program
context["origin_main_program"] = self.origin_main_program
context["loss"] = loss
if startup_program == None: if startup_program == None:
self.origin_startup_program = \ self.origin_startup_program = \
paddle.default_startup_program().clone(for_test=False) paddle.default_startup_program().clone(for_test=False)
...@@ -288,6 +291,8 @@ class Fleet(object): ...@@ -288,6 +291,8 @@ class Fleet(object):
else: else:
self.origin_startup_program = \ self.origin_startup_program = \
startup_program.clone(for_test=False) startup_program.clone(for_test=False)
context["origin_startup_program"] = startup_program
context["role_maker"] = self._role_maker
# compile time # compile time
distributed_optimizer_list = \ distributed_optimizer_list = \
...@@ -317,6 +322,9 @@ class Fleet(object): ...@@ -317,6 +322,9 @@ class Fleet(object):
valid_strategy = self.strategy_compiler._get_valid_strategy( valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list) self.user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = valid_strategy
self.valid_strategy = valid_strategy self.valid_strategy = valid_strategy
optimize_ops = [] optimize_ops = []
...@@ -334,6 +342,8 @@ class Fleet(object): ...@@ -334,6 +342,8 @@ class Fleet(object):
parameter_list=parameter_list, parameter_list=parameter_list,
no_grad_set=no_grad_set) no_grad_set=no_grad_set)
context["program_optimize_ops"] = optimize_ops
context["program_params_grads"] = params_grads
if graph_optimizer: if graph_optimizer:
optimize_ops, params_grads = graph_optimizer.minimize( optimize_ops, params_grads = graph_optimizer.minimize(
loss, loss,
...@@ -344,12 +354,13 @@ class Fleet(object): ...@@ -344,12 +354,13 @@ class Fleet(object):
# if a graph optimizer takes effect, mostly # if a graph optimizer takes effect, mostly
# optimizers_ops and params_grads are None # optimizers_ops and params_grads are None
# i.e. users can not modify current computation graph anymore # i.e. users can not modify current computation graph anymore
context["graph_optimize_ops"] = optimize_ops
context["graph_optimize_grads"] = params_grads
if self._runtime_handle is None: if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime( self._runtime_handle = RuntimeFactory()._create_runtime(context)
valid_strategy, self._role_maker, optimize_ops, params_grads)
if self._util is None: if self._util is None:
self._util = UtilFactory()._create_util( self._util = UtilFactory()._create_util(context)
valid_strategy, self._role_maker, optimize_ops, params_grads)
return optimize_ops, params_grads return optimize_ops, params_grads
...@@ -19,6 +19,7 @@ from ..meta_optimizers import GraphExecutionOptimizer ...@@ -19,6 +19,7 @@ from ..meta_optimizers import GraphExecutionOptimizer
from ..meta_optimizers import PipelineOptimizer from ..meta_optimizers import PipelineOptimizer
from ..meta_optimizers import LocalSGDOptimizer from ..meta_optimizers import LocalSGDOptimizer
from ..meta_optimizers import LarsOptimizer from ..meta_optimizers import LarsOptimizer
from ..meta_optimizers import DGCOptimizer
__all__ = ["MetaOptimizerFactory"] __all__ = ["MetaOptimizerFactory"]
...@@ -30,6 +31,7 @@ meta_optimizer_names = [ ...@@ -30,6 +31,7 @@ meta_optimizer_names = [
"PipelineOptimizer", "PipelineOptimizer",
"LocalSGDOptimizer", "LocalSGDOptimizer",
"LarsOptimizer", "LarsOptimizer",
"DGCOptimizer",
] ]
......
...@@ -18,10 +18,8 @@ class RuntimeFactory(object): ...@@ -18,10 +18,8 @@ class RuntimeFactory(object):
def __init__(self): def __init__(self):
pass pass
def _create_runtime(self, final_dist_strategy, role_maker, opt_ops, def _create_runtime(self, context):
params_grads): if context["role_maker"]._is_collective:
if role_maker._is_collective:
collective_runtime = CollectiveRuntime() collective_runtime = CollectiveRuntime()
collective_runtime._set_basic_info(final_dist_strategy, role_maker, collective_runtime._set_basic_info(context)
opt_ops, params_grads)
return collective_runtime return collective_runtime
...@@ -20,11 +20,10 @@ __all__ = ['UtilBase'] ...@@ -20,11 +20,10 @@ __all__ = ['UtilBase']
class UtilFactory(object): class UtilFactory(object):
def _create_util(self, dist_strategy, role_maker, optimize_ops, def _create_util(self, context):
params_grads):
util = UtilBase() util = UtilBase()
util._set_strategy(dist_strategy) util._set_strategy(context["valid_strategy"])
util._set_role_maker(role_maker) util._set_role_maker(context["role_maker"])
return util return util
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
from paddle.fleet.launch_utils import get_cluster, logger
def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
"""
    args_node_ips (string): cluster node ips passed from the command line
    """
    # You can obtain the IP info automatically when using paddlecloud multi-node mode.
node_ips = os.getenv("PADDLE_TRAINERS")
assert node_ips is not None, "PADDLE_TRAINERS should not be None"
node_ip = os.getenv("POD_IP")
assert node_ip is not None, "POD_IP should not be None"
node_rank = os.getenv("PADDLE_TRAINER_ID")
assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
node_ips = node_ips.split(",")
num_nodes = len(node_ips)
node_rank = int(node_rank)
if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
logger.warning(
"Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args_node_ips, node_ips))
started_port = args_port
print("num_nodes:", num_nodes)
if num_nodes > 1:
try:
paddle_port = int(os.getenv("PADDLE_PORT", ""))
paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
if paddle_port_num >= len(
selected_gpus) and paddle_port != args_port:
logger.warning("Use Cloud specified port:{}.".format(
paddle_port))
started_port = paddle_port
except Exception as e:
print(e)
pass
if started_port is None:
started_port = 6170
logger.debug("parsed from args:node_ips:{} \
node_ip:{} node_rank:{} started_port:{}"
.format(node_ips, node_ip, node_rank, started_port))
ports = [x for x in range(started_port, started_port + len(selected_gpus))]
cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
return cluster, cluster.pods[node_rank]
def use_paddlecloud():
node_ips = os.getenv("PADDLE_TRAINERS")
node_ip = os.getenv("POD_IP")
node_rank = os.getenv("PADDLE_TRAINER_ID")
if node_ips is None or node_ip is None or node_rank is None:
return False
else:
return True
def get_trainers_num():
return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
paddle.distributed.launch is a module that spawns multiple distributed
processes on each training node for gpu training and cpu training.
Usage:
    In both single-node and multi-node training, this module
    launches a process on each of the given gpu cards or cpu machines.
GPU training:
1. for single node training with all visible gpu cards:
fleetrun your_training_py (arg1 arg2 and all others)
2. for single node training with [0,4) cards
fleetrun --gpus="0,1,2,3" your_training_py (arg1 arg2 and all others)
    3. for multi-node training, such as two nodes: 192.168.0.16, 192.168.0.17
on 192.168.0.16:
fleetrun --ips="192.168.0.16,192.168.0.17" --node_ip=192.168.0.16 \
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
fleetrun --ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.17 \
your_training_py (arg1 arg2 and all others)
CPU training:
1. for single node training with multi servers and workers:
fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
    2. for multi-node training, such as two nodes: 192.168.0.16, 192.168.0.17 \
with 2 servers and 4 workers.
on 192.168.0.16:
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
your_training_py (arg1 arg2 and all others)
"""
from __future__ import print_function
import sys
from sys import version
import subprocess
import os
import time
import six
import copy
from argparse import ArgumentParser, REMAINDER
import paddle
import paddle.fluid as fluid
from paddle.fleet.launch_utils import *
import paddle.fleet.cloud_utils as cloud_utils
def _print_arguments(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def _parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(
description='''start paddle training using multi-process mode.
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
''')
#Optional arguments for the launch helper
parser.add_argument(
"--ips",
type=str,
default="127.0.0.1",
help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
parser.add_argument(
"--gpus",
type=str,
default=None,
help="It's for gpu training and the training process will run on the gpus,"
"each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
)
parser.add_argument(
"--servers", type=str, default="", help="User defined servers ip:port")
parser.add_argument(
"--workers", type=str, default="", help="User defined workers ip:port")
parser.add_argument(
"--worker_num", type=int, default=2, help="number of workers")
parser.add_argument(
"--server_num", type=int, default=2, help="number of servers")
parser.add_argument(
"--log_dir",
type=str,
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)
#positional
parser.add_argument(
"training_script",
type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
#rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def get_cluster_from_args(args, gpus):
node_ips = [x.strip() for x in args.ips.split(',')]
if len(node_ips) == 1:
node_ip = node_ips[0]
else:
_, node_ip = get_host_name_ip()
# node_ip = args.node_ip
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips:{%s}" \
% (node_ip, node_ips)
node_rank = node_ips.index(node_ip)
logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
node_ips, node_ip, node_rank))
free_ports = None
if not cloud_utils.use_paddlecloud() and len(
node_ips) <= 1 and os.environ.get('FLAGS_START_PORT') is None:
free_ports = find_free_ports(len(gpus))
if free_ports is not None:
free_ports = list(free_ports)
else:
start_port = 6070
if os.environ.get('FLAGS_START_PORT') is not None:
                start_port = int(os.environ.get('FLAGS_START_PORT'))
free_ports = [x for x in range(start_port, start_port + len(gpus))]
return get_cluster(node_ips, node_ip, free_ports, gpus)
def get_gpus(gpus):
if gpus is None:
gpus_num = fluid.core.get_cuda_device_count()
gpus = [str(x) for x in range(0, gpus_num)]
else:
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices is None or cuda_visible_devices == "":
gpus = [x.strip() for x in gpus.split(',')]
else:
# change gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7;
# therefore gpus=0,1,2,3
cuda_visible_devices_list = cuda_visible_devices.split(',')
for x in gpus.split(','):
assert x in cuda_visible_devices_list, "Can't find "\
"your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
gpus = [
cuda_visible_devices_list.index(x.strip())
for x in gpus.split(',')
]
return gpus
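# Worked example (hypothetical values): with CUDA_VISIBLE_DEVICES="4,5,6,7" and
# --gpus="4,5", get_gpus("4,5") remaps each requested id to its position in the
# visible list and returns [0, 1].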
def launch_collective(args):
# parse arguments, used for cloud-single-machine and local
gpus = get_gpus(args.gpus)
trainers_num = cloud_utils.get_trainers_num()
logger.debug("parsed from args trainerss_num:{} gpus:{}".format(
trainers_num, gpus))
cluster = None
pod = None
if cloud_utils.use_paddlecloud() and trainers_num != 1:
cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus)
logger.info("get cluster from cloud:{}".format(cluster))
else:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster, pod = get_cluster_from_args(args, gpus)
logger.info("get cluster from args:{}".format(cluster))
procs = start_local_trainers(
cluster,
pod,
training_script=args.training_script,
training_script_args=args.training_script_args,
log_dir=args.log_dir)
while True:
alive = watch_local_trainers(procs, cluster.trainers_nranks())
if not alive:
logger.info("Local procs complete, POD info:{}".format(pod))
break
time.sleep(3)
def launch_ps(args):
worker_num = args.worker_num
server_num = args.server_num
start_port = 6170
if os.environ.get('FLAGS_START_PORT') is not None:
        start_port = int(os.environ.get('FLAGS_START_PORT'))
default_env = os.environ.copy()
current_env = copy.copy(default_env)
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
procs = []
cmds = []
log_fns = []
ports = range(start_port, start_port + server_num, 1)
default_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports])
user_endpoints = ""
if args.servers == "":
user_endpoints = default_endpoints
else:
user_endpoints = args.servers
user_endpoints_ips = [x.split(":")[0] for x in user_endpoints.split(",")]
user_endpoints_port = [x.split(":")[1] for x in user_endpoints.split(",")]
for i in range(server_num):
current_env.update({
"PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
"PADDLE_PORT": user_endpoints_port[i],
"TRAINING_ROLE": "PSERVER",
"PADDLE_TRAINERS_NUM": str(worker_num),
"POD_IP": user_endpoints_ips[i]
})
cmd = [sys.executable, "-u", args.training_script
] + args.training_script_args
cmds.append(cmd)
if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/serverlog.%d" % (args.log_dir, i), "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
procs.append(proc)
for i in range(worker_num):
current_env.update({
"PADDLE_PSERVERS_IP_PORT_LIST": user_endpoints,
"PADDLE_TRAINERS_NUM": str(worker_num),
"TRAINING_ROLE": "TRAINER",
"PADDLE_TRAINER_ID": str(i)
})
cmd = [sys.executable, "-u", args.training_script
] + args.training_script_args
cmds.append(cmd)
if args.log_dir is not None:
os.system("mkdir -p {}".format(args.log_dir))
fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
log_fns.append(fn)
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
procs.append(proc)
# only wait worker to finish here
for i, proc in enumerate(procs):
if i < server_num:
continue
procs[i].wait()
if len(log_fns) > 0:
log_fns[i].close()
print("all workers exit, going to finish parameter server", file=sys.stderr)
for i in range(server_num):
if len(log_fns) > 0:
log_fns[i].close()
procs[i].terminate()
print("all parameter server are killed", file=sys.stderr)
def launch():
args = _parse_args()
logger = get_logger()
_print_arguments(args)
ps_args = ['--worker_num', '--server_num', '--servers', '--workers']
collective_args = ['--ips', '--gpus']
has_ps_args = [
ps_arg for ps_arg in ps_args if ps_arg in " ".join(sys.argv[1:-1])
]
has_collective_args = [
co_arg for co_arg in collective_args
if co_arg in " ".join(sys.argv[1:-1])
]
if len(has_ps_args) > 0 or fluid.core.get_cuda_device_count() == 0:
logger.info("Run cpu parameter-sever mode.")
launch_ps(args)
elif len(has_collective_args) > 0:
logger.info("Run gpu collective mode.")
launch_collective(args)
else:
logger.warning(
"Not found distinct args. Default use gpu collective mode")
launch_collective(args)
if __name__ == "__main__":
launch()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import logging
import socket
import time
import os
import signal
import copy
import sys
import subprocess
from contextlib import closing
import distutils.util
logger = logging.getLogger("root")
logger.propagate = False
class Cluster(object):
def __init__(self, hdfs):
self.job_server = None
self.pods = []
self.hdfs = None
self.job_stage_flag = None
def __str__(self):
return "job_server:{} pods:{} job_stage_flag:{} hdfs:{}".format(
self.job_server, [str(pod) for pod in self.pods],
self.job_stage_flag, self.hdfs)
def __eq__(self, cluster):
if len(self.pods) != len(cluster.pods):
return False
for a, b in zip(self.pods, cluster.pods):
if a != b:
return False
if self.job_stage_flag != cluster.job_stage_flag:
return False
return True
def __ne__(self, cluster):
return not self.__eq__(cluster)
    def update_pods(self, cluster):
self.pods = copy.copy(cluster.pods)
def trainers_nranks(self):
return len(self.trainers_endpoints())
def pods_nranks(self):
return len(self.pods)
def trainers_endpoints(self):
r = []
for pod in self.pods:
for t in pod.trainers:
r.append(t.endpoint)
return r
def pods_endpoints(self):
r = []
for pod in self.pods:
ep = "{}:{}".format(pod.addr, pod.port)
assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format(
ep)
r.append(ep)
return r
def get_pod_by_id(self, pod_id):
for pod in self.pods:
if str(pod_id) == str(pod.id):
return pod
return None
class JobServer(object):
def __init__(self):
self.endpoint = None
def __str__(self):
return "{}".format(self.endpoint)
def __eq__(self, j):
        return self.endpoint == j.endpoint
def __ne__(self, j):
return not self == j
class Trainer(object):
def __init__(self):
self.gpus = []
self.endpoint = None
self.rank = None
def __str__(self):
return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint,
self.rank)
def __eq__(self, t):
if len(self.gpus) != len(t.gpus):
return False
if self.endpoint != t.endpoint or \
self.rank != t.rank:
return False
for a, b in zip(self.gpus, t.gpus):
if a != b:
return False
return True
def __ne__(self, t):
return not self == t
def rank(self):
return self.rank
class Pod(object):
def __init__(self):
self.rank = None
self.id = None
self.addr = None
self.port = None
self.trainers = []
self.gpus = []
def __str__(self):
return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
self.rank, self.id, self.addr, self.port, self.gpus,
[str(t) for t in self.trainers])
def __eq__(self, pod):
if self.rank != pod.rank or \
self.id != pod.id or \
self.addr != pod.addr or \
self.port != pod.port:
logger.debug("pod {} != pod".format(self, pod))
return False
if len(self.trainers) != len(pod.trainers):
logger.debug("trainers {} != {}".format(self.trainers,
pod.trainers))
return False
for i in range(len(self.trainers)):
if self.trainers[i] != pod.trainers[i]:
logger.debug("trainer {} != {}".format(self.trainers[i],
pod.trainers[i]))
return False
return True
def __ne__(self, pod):
return not self == pod
def parse_response(self, res_pods):
pass
def rank(self):
return self.rank
def get_visible_gpus(self):
r = ""
for g in self.gpus:
r += "{},".format(g)
assert r != "", "this pod {} can't see any gpus".format(self)
r = r[:-1]
return r
def get_logger(log_level=20, name="root"):
logger = logging.getLogger(name)
logger.setLevel(log_level)
log_handler = logging.StreamHandler()
log_format = logging.Formatter(
'%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
log_handler.setFormatter(log_format)
logger.addHandler(log_handler)
return logger
def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
assert type(paddle_ports) is list, "paddle_ports must be list"
cluster = Cluster(hdfs=None)
trainer_rank = 0
for node_rank, ip in enumerate(node_ips):
pod = Pod()
pod.rank = node_rank
pod.addr = ip
for i in range(len(selected_gpus)):
trainer = Trainer()
trainer.gpus.append(selected_gpus[i])
trainer.endpoint = "%s:%d" % (ip, paddle_ports[i])
trainer.rank = trainer_rank
trainer_rank += 1
pod.trainers.append(trainer)
cluster.pods.append(pod)
pod_rank = node_ips.index(node_ip)
return cluster, cluster.pods[pod_rank]
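# Minimal sketch (ips and ports invented) of the topology this helper builds:
# two nodes with two selected gpus each yield four trainers with consecutive
# global ranks.
#
#   cluster, pod = get_cluster(["10.0.0.1", "10.0.0.2"], "10.0.0.1",
#                              [6170, 6171], selected_gpus=["0", "1"])
#   cluster.trainers_nranks()    # -> 4
#   pod.rank, len(pod.trainers)  # -> (0, 2)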
def terminate_local_procs(procs):
for p in procs:
if p.proc.poll() is None:
p.proc.terminate()
p.log_fn.close()
logger.debug("terminate process id:{}".format(p.proc.pid))
    # wait until all processes have terminated
time.sleep(3)
for step in range(0, 50):
alive = False
for p in procs:
            if p.proc.poll() is None:  # not terminated
os.kill(p.proc.pid, signal.SIGKILL)
alive = True
if not alive:
logger.info("terminate all the procs")
return
time.sleep(3)
logger.fatal("can't kill all process and exit")
exit(1)
def get_host_name_ip():
try:
host_name = socket.gethostname()
host_ip = socket.gethostbyname(host_name)
return host_name, host_ip
except:
return None
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def find_free_ports(num):
def __free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
port_set = set()
step = 0
while True:
port = __free_port()
if port not in port_set:
port_set.add(port)
if len(port_set) >= num:
return port_set
step += 1
if step > 100:
print(
"can't find avilable port and use the specified static port now!"
)
return None
return None
class TrainerProc(object):
def __init__(self):
self.proc = None
self.log_fn = None
self.log_offset = None
self.rank = None
self.local_rank = None
self.cmd = None
def start_local_trainers(cluster,
pod,
training_script,
training_script_args,
log_dir=None):
current_env = copy.copy(os.environ.copy())
    # Paddle broadcasts ncclUniqueId over sockets, and proxies may make
    # trainers unreachable, so delete the proxy variables. Setting them to ""
    # would make grpc log the error message "bad uri", so just delete them.
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
procs = []
for idx, t in enumerate(pod.trainers):
proc_env = {
"FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
"PADDLE_TRAINER_ID": "%d" % t.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
}
current_env.update(proc_env)
logger.debug("trainer proc env:{}".format(current_env))
cmd = [sys.executable, "-u", training_script] + training_script_args
logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))
fn = None
if log_dir is not None:
os.system("mkdir -p {}".format(log_dir))
fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:
proc = subprocess.Popen(cmd, env=current_env)
tp = TrainerProc()
tp.proc = proc
tp.rank = t.rank
tp.local_rank = idx
tp.log_fn = fn
tp.log_offset = fn.tell() if fn else None
tp.cmd = cmd
procs.append(tp)
return procs
def pull_worker_log(tp):
if tp.log_fn:
with open(tp.log_fn.name, 'r') as fin:
fin.seek(tp.log_offset, 0)
for line in fin:
try:
sys.stdout.write(line)
except UnicodeEncodeError:
sys.stdout.write(
'UnicodeEncodeError occurs at this line. '
'Please refer to the original log file "%s"\n' %
tp.log_fn.name)
tp.log_offset = fin.tell()
def watch_local_trainers(procs, nranks):
try:
error = False
error_rank = []
        # wait for all processes to finish, or stop on the first error
alive = False
for p in procs:
if p.log_fn and p.local_rank == 0:
pull_worker_log(p)
ret = p.proc.poll()
if ret is None:
alive = True
elif ret != 0:
error = True
error_rank.append(p.rank)
if error:
terminate_local_procs(procs)
exit(1)
except KeyboardInterrupt:
logger.warning("KeyboardInterrupt, exit")
terminate_local_procs(procs)
raise
except SystemExit:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
except:
logger.error(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
format(nranks, error_rank))
terminate_local_procs(procs)
raise
return alive
...@@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer ...@@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer
from .pipeline_optimizer import PipelineOptimizer from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer from .localsgd_optimizer import LocalSGDOptimizer
from .lars_optimizer import LarsOptimizer from .lars_optimizer import LarsOptimizer
from .dgc_optimizer import DGCOptimizer
__all__ = [ __all__ = [
'AMPOptimizer', 'AMPOptimizer',
...@@ -26,4 +27,5 @@ __all__ = [ ...@@ -26,4 +27,5 @@ __all__ = [
'PipelineOptimizer', 'PipelineOptimizer',
'LocalSGDOptimizer', 'LocalSGDOptimizer',
'LarsOptimizer', 'LarsOptimizer',
'DGCOptimizer',
] ]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer
from .meta_optimizer_base import MetaOptimizerBase
import logging
__all__ = ["DGCOptimizer"]
class DGCOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
super(DGCOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
self.dgc_opt = None
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = []
def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
user_defined_strategy):
super(DGCOptimizer, self)._set_basic_info(
loss, role_maker, user_defined_optimizer, user_defined_strategy)
opt = self.inner_opt
if not isinstance(opt, Momentum):
return
configs = self.user_defined_strategy.dgc_configs
if len(configs['sparsity']) == 0:
# default is [0.999]
configs['sparsity'] = [0.999]
self.dgc_opt = DGCMomentumOptimizer(
learning_rate=opt._learning_rate,
momentum=opt._momentum,
rampup_begin_step=configs['rampup_begin_step'],
rampup_step=configs['rampup_step'],
sparsity=configs['sparsity'],
parameter_list=opt._parameter_list,
use_nesterov=opt._use_nesterov,
num_trainers=self.role_maker.worker_num(),
regularization=opt.regularization,
grad_clip=opt._grad_clip,
name=opt._name)
def _can_apply(self):
if self.user_defined_strategy.dgc:
if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer")
return False
if self.role_maker.worker_num() <= 1:
logging.warn("dgc only works on multi cards")
return False
return True
return False
def _disable_strategy(self, dist_strategy):
dist_strategy.dgc = False
dist_strategy.dgc_configs = {
'rampup_begin_step': 0,
'rampup_step': 1,
'sparsity': [0.999]
}
def backward(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None):
return self.dgc_opt.backward(loss, startup_program, parameter_list,
no_grad_set, callbacks)
def minimize_impl(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
optimize_ops, params_grads = \
self.dgc_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
return optimize_ops, params_grads
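A minimal usage sketch for the optimizer above (module paths assumed from the surrounding fleet code; values illustrative): DGC is switched on through the strategy, and the config keys mirror the ones read in _set_basic_info.

import paddle.fleet as fleet
from paddle.fluid.optimizer import Momentum

strategy = fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {
    "rampup_begin_step": 0,
    "rampup_step": 1,
    "sparsity": [0.999],
}

optimizer = Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# optimizer.minimize(loss) would then be routed through DGCOptimizer whenever
# _can_apply() holds (a multi-card run with a Momentum inner optimizer).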
...@@ -40,7 +40,8 @@ class MetaOptimizerBase(object): ...@@ -40,7 +40,8 @@ class MetaOptimizerBase(object):
return True return True
def _disable_strategy(self, dist_strategy): def _disable_strategy(self, dist_strategy):
raise NotImplementedError("you should implement disable strategy") raise NotImplementedError("you should implement disable strategy in {}".
format(type(self).__name__))
def minimize_impl(self, def minimize_impl(self,
loss, loss,
......
...@@ -19,11 +19,8 @@ class RuntimeBase(object): ...@@ -19,11 +19,8 @@ class RuntimeBase(object):
def __init__(self): def __init__(self):
pass pass
def _set_basic_info(self, loss, role_maker, optimizer, strategy): def _set_basic_info(self, context):
self.loss = loss self.context = context
self.role_maker = role_maker
self.optimizer = optimizer
self.strategy = strategy
def _run_worker(self): def _run_worker(self):
pass pass
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import functools
import sys
__all__ = ['deprecated']
def deprecated(since, instead, extra_message=""):
def decorator(func):
err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
func.__name__, since, instead)
if len(extra_message) != 0:
err_msg += "\n"
err_msg += extra_message
@functools.wraps(func)
def wrapper(*args, **kwargs):
print(err_msg, file=sys.stderr)
return func(*args, **kwargs)
wrapper.__doc__ += "\n "
wrapper.__doc__ += err_msg
return wrapper
return decorator
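A small illustrative use of the decorator defined above (the function and API names are invented; import it from wherever this module lives):

@deprecated(since="0.15.0", instead="paddle.new_api")
def old_api(x):
    """Old entry point."""
    return x

old_api(1)  # prints the deprecation message to stderr, then returns 1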
...@@ -36,7 +36,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager ...@@ -36,7 +36,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.annotations import deprecated
__all__ = ['ProgramTranslator', 'convert_to_static'] __all__ = ['ProgramTranslator', 'convert_to_static']
......
...@@ -129,6 +129,45 @@ class Layer(core.Layer): ...@@ -129,6 +129,45 @@ class Layer(core.Layer):
for layer in self.sublayers(): for layer in self.sublayers():
layer.eval() layer.eval()
def apply(self, fn):
"""
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model.
Parameters:
fn (function): a function to be applied to each sublayer
Returns:
Layer: self
Example::
.. code-block:: python
import paddle
import paddle.nn as nn
paddle.enable_imperative()
net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
def init_weights(layer):
if type(layer) == nn.Linear:
print('before init weight:', layer.weight.numpy())
new_weight = paddle.fill_constant(layer.weight.shape, layer.weight.dtype, value=0.9)
layer.weight.set_value(new_weight)
print('after init weight:', layer.weight.numpy())
net.apply(init_weights)
print(net.state_dict())
"""
for layer in self.sublayers():
layer.apply(fn)
fn(self)
return self
def full_name(self): def full_name(self):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ """Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
......
...@@ -20,12 +20,12 @@ from __future__ import print_function ...@@ -20,12 +20,12 @@ from __future__ import print_function
from .layer_function_generator import autodoc from .layer_function_generator import autodoc
from ..framework import unique_name from ..framework import unique_name
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..annotations import deprecated from paddle.utils import deprecated
__all__ = [] __all__ = []
@deprecated(since='0.15.0', instead="ParallelExecutor") @deprecated(since='0.15.0', update_to="paddle.fluid.ParallelExecutor")
@autodoc() @autodoc()
def get_places(device_count=None, device_type=None): def get_places(device_count=None, device_type=None):
helper = LayerHelper('get_places', **locals()) helper = LayerHelper('get_places', **locals())
......
...@@ -37,6 +37,7 @@ from functools import reduce ...@@ -37,6 +37,7 @@ from functools import reduce
from .. import core from .. import core
from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
import paddle import paddle
from paddle.utils import deprecated
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -11614,6 +11615,7 @@ Examples: ...@@ -11614,6 +11615,7 @@ Examples:
return _elementwise_op(LayerHelper('elementwise_sub', **locals())) return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
@deprecated(since="2.0.0", update_to="paddle.multiply")
def elementwise_mul(x, y, axis=-1, act=None, name=None): def elementwise_mul(x, y, axis=-1, act=None, name=None):
""" """
:alias_main: paddle.elementwise_mul :alias_main: paddle.elementwise_mul
......
...@@ -47,9 +47,8 @@ __all__ = [ ...@@ -47,9 +47,8 @@ __all__ = [
'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer',
'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta',
'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum', 'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum',
'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer', 'LarsMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage',
'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer', 'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer'
'RecomputeOptimizer'
] ]
......
...@@ -27,6 +27,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_async) ...@@ -27,6 +27,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo) list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async) list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync) list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint) list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer)
...@@ -38,6 +39,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) ...@@ -38,6 +39,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
...@@ -387,6 +389,7 @@ if(WITH_DISTRIBUTE) ...@@ -387,6 +389,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_dgc_op MODULES test_dgc_op) py_test_modules(test_dgc_op MODULES test_dgc_op)
py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op) py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op)
py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer) py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer)
py_test_modules(test_fleet_dgc_meta_optimizer MODULES test_fleet_dgc_meta_optimizer)
else() else()
# if not with dgc, must close all dgc tests # if not with dgc, must close all dgc tests
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
...@@ -399,6 +402,7 @@ if(WITH_DISTRIBUTE) ...@@ -399,6 +402,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint) py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
endif() endif()
bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
bash_test_modules(test_fleet_launch MODULES test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
set(dist_ut_port 20001) set(dist_ut_port 20001)
foreach(TEST_OP ${DIST_TEST_OPS}) foreach(TEST_OP ${DIST_TEST_OPS})
......
...@@ -18,6 +18,7 @@ import unittest ...@@ -18,6 +18,7 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME
from bert_dygraph_model import PretrainModelLayer from bert_dygraph_model import PretrainModelLayer
from bert_utils import get_bert_config, get_feed_data_reader from bert_utils import get_bert_config, get_feed_data_reader
...@@ -28,9 +29,11 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace( ...@@ -28,9 +29,11 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
SEED = 2020 SEED = 2020
STEP_NUM = 10 STEP_NUM = 10
PRINT_STEP = 2 PRINT_STEP = 2
MODEL_SAVE_PATH = "./bert.inference.model"
DY_STATE_DICT_SAVE_PATH = "./bert.dygraph"
def train(bert_config, data_reader): def train(bert_config, data_reader, to_static):
with fluid.dygraph.guard(place): with fluid.dygraph.guard(place):
fluid.default_main_program().random_seed = SEED fluid.default_main_program().random_seed = SEED
fluid.default_startup_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED
...@@ -79,18 +82,74 @@ def train(bert_config, data_reader): ...@@ -79,18 +82,74 @@ def train(bert_config, data_reader):
step_idx += 1 step_idx += 1
if step_idx == STEP_NUM: if step_idx == STEP_NUM:
if to_static:
fluid.dygraph.jit.save(bert, MODEL_SAVE_PATH)
else:
fluid.dygraph.save_dygraph(bert.state_dict(),
DY_STATE_DICT_SAVE_PATH)
break break
return loss, ppl return loss, ppl
def train_dygraph(bert_config, data_reader): def train_dygraph(bert_config, data_reader):
program_translator.enable(False) program_translator.enable(False)
return train(bert_config, data_reader) return train(bert_config, data_reader, False)
def train_static(bert_config, data_reader): def train_static(bert_config, data_reader):
program_translator.enable(True) program_translator.enable(True)
return train(bert_config, data_reader) return train(bert_config, data_reader, True)
def predict_static(data):
exe = fluid.Executor(place)
# load inference model
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(
MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
pred_res = exe.run(inference_program,
feed=dict(zip(feed_target_names, data)),
fetch_list=fetch_targets)
return pred_res
def predict_dygraph(bert_config, data):
program_translator.enable(False)
with fluid.dygraph.guard(place):
bert = PretrainModelLayer(
config=bert_config, weight_sharing=False, use_fp16=False)
model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH)
bert.set_dict(model_dict)
bert.eval()
input_vars = [fluid.dygraph.to_variable(x) for x in data]
src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars
pred_res = bert(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
input_mask=input_mask,
mask_label=mask_label,
mask_pos=mask_pos,
labels=labels)
pred_res = [var.numpy() for var in pred_res]
return pred_res
def predict_dygraph_jit(data):
with fluid.dygraph.guard(place):
bert = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
bert.eval()
src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data
pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label,
mask_pos, labels)
pred_res = [var.numpy() for var in pred_res]
return pred_res
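The three prediction paths above (load_inference_model on a static executor, load_dygraph plus set_dict, and jit.load) pair up with how the model was saved during training. A minimal, self-contained sketch of that save pairing follows; the TinyNet layer and the output paths are hypothetical and not part of this test:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import declarative

class TinyNet(fluid.dygraph.Layer):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = fluid.dygraph.Linear(4, 2)

    @declarative
    def forward(self, x):
        return self.fc(x)

with fluid.dygraph.guard():
    net = TinyNet()
    x = fluid.dygraph.to_variable(np.ones([1, 4], dtype='float32'))
    net(x)  # run once so the declarative forward is traced
    # to_static path: produces an inference model readable by both
    # fluid.io.load_inference_model and fluid.dygraph.jit.load
    fluid.dygraph.jit.save(net, "./tiny.inference.model")
    # pure dygraph path: a state dict readable by fluid.dygraph.load_dygraph
    fluid.dygraph.save_dygraph(net.state_dict(), "./tiny.dygraph")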
class TestBert(unittest.TestCase): class TestBert(unittest.TestCase):
...@@ -104,14 +163,36 @@ class TestBert(unittest.TestCase): ...@@ -104,14 +163,36 @@ class TestBert(unittest.TestCase):
dygraph_loss, dygraph_ppl = train_dygraph(self.bert_config, dygraph_loss, dygraph_ppl = train_dygraph(self.bert_config,
self.data_reader) self.data_reader)
self.assertTrue( self.assertTrue(
np.allclose(static_loss, static_loss), np.allclose(static_loss, dygraph_loss),
msg="static_loss: {} \n static_loss: {}".format(static_loss, msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
dygraph_loss)) dygraph_loss))
self.assertTrue( self.assertTrue(
np.allclose(static_ppl, dygraph_ppl), np.allclose(static_ppl, dygraph_ppl),
msg="static_ppl: {} \n dygraph_ppl: {}".format(static_ppl, msg="static_ppl: {} \n dygraph_ppl: {}".format(static_ppl,
dygraph_ppl)) dygraph_ppl))
self.verify_predict()
def verify_predict(self):
for data in self.data_reader.data_generator()():
dygraph_pred_res = predict_dygraph(self.bert_config, data)
static_pred_res = predict_static(data)
dygraph_jit_pred_res = predict_dygraph_jit(data)
for dy_res, st_res, dy_jit_res in zip(
dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
self.assertTrue(
np.allclose(st_res, dy_res),
"dygraph_res: {},\n static_res: {}".format(
dy_res[~np.isclose(st_res, dy_res)],
st_res[~np.isclose(st_res, dy_res)]))
self.assertTrue(
np.allclose(st_res, dy_jit_res),
"dygraph_jit_res: {},\n static_res: {}".format(
dy_jit_res[~np.isclose(st_res, dy_jit_res)],
st_res[~np.isclose(st_res, dy_jit_res)]))
break
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -692,13 +692,20 @@ class TestTrain(unittest.TestCase): ...@@ -692,13 +692,20 @@ class TestTrain(unittest.TestCase):
video_data = np.array([item[0] for item in data]).astype(DATATYPE) video_data = np.array([item[0] for item in data]).astype(DATATYPE)
static_pred_res = self.predict_static(video_data) static_pred_res = self.predict_static(video_data)
dygraph_pred_res = self.predict_dygraph(video_data) dygraph_pred_res = self.predict_dygraph(video_data)
dygraph_jit_pred_res = self.predict_dygraph_jit(video_data)
for dy_res, st_res in zip(dygraph_pred_res, static_pred_res): for dy_res, st_res, dy_jit_res in zip(
dygraph_pred_res, static_pred_res, dygraph_jit_pred_res):
self.assertTrue( self.assertTrue(
np.allclose(st_res, dy_res), np.allclose(st_res, dy_res),
"dygraph_res: {},\n static_res: {}".format( "dygraph_res: {},\n static_res: {}".format(
dy_res[~np.isclose(st_res, dy_res)], dy_res[~np.isclose(st_res, dy_res)],
st_res[~np.isclose(st_res, dy_res)])) st_res[~np.isclose(st_res, dy_res)]))
self.assertTrue(
np.allclose(st_res, dy_jit_res),
"dygraph_jit_res: {},\n static_res: {}".format(
dy_jit_res[~np.isclose(st_res, dy_jit_res)],
st_res[~np.isclose(st_res, dy_jit_res)]))
break break
def predict_dygraph(self, data): def predict_dygraph(self, data):
...@@ -731,6 +738,17 @@ class TestTrain(unittest.TestCase): ...@@ -731,6 +738,17 @@ class TestTrain(unittest.TestCase):
return pred_res return pred_res
def predict_dygraph_jit(self, data):
with fluid.dygraph.guard(self.place):
bmn = fluid.dygraph.jit.load(self.args.infer_dir)
bmn.eval()
x = to_variable(data)
pred_res = bmn(x)
pred_res = [var.numpy() for var in pred_res]
return pred_res
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -535,9 +535,14 @@ class TestLACModel(unittest.TestCase): ...@@ -535,9 +535,14 @@ class TestLACModel(unittest.TestCase):
batch = [np.vstack(var) for var in zip(*batch)] batch = [np.vstack(var) for var in zip(*batch)]
dy_pre = self.predict_dygraph(batch) dy_pre = self.predict_dygraph(batch)
st_pre = self.predict_static(batch) st_pre = self.predict_static(batch)
dy_jit_pre = self.predict_dygraph_jit(batch)
self.assertTrue( self.assertTrue(
np.allclose(dy_pre, st_pre), np.allclose(dy_pre, st_pre),
msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
self.assertTrue(
np.allclose(dy_jit_pre, st_pre),
msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre,
st_pre))
def predict_dygraph(self, batch): def predict_dygraph(self, batch):
words, targets, length = batch words, targets, length = batch
...@@ -576,6 +581,16 @@ class TestLACModel(unittest.TestCase): ...@@ -576,6 +581,16 @@ class TestLACModel(unittest.TestCase):
fetch_list=fetch_targets) fetch_list=fetch_targets)
return pred_res[0] return pred_res[0]
def predict_dygraph_jit(self, batch):
words, targets, length = batch
with fluid.dygraph.guard(self.place):
model = fluid.dygraph.jit.load(self.args.model_save_dir)
model.eval()
pred_res = model(to_variable(words), to_variable(length))
return pred_res.numpy()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -19,6 +19,7 @@ from paddle.fluid.initializer import MSRA ...@@ -19,6 +19,7 @@ from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME
import unittest import unittest
...@@ -433,14 +434,15 @@ class Args(object): ...@@ -433,14 +434,15 @@ class Args(object):
class_dim = 50 class_dim = 50
print_step = 1 print_step = 1
train_step = 10 train_step = 10
place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
) else fluid.CPUPlace()
model_save_path = model + ".inference.model"
dy_state_dict_save_path = model + ".dygraph"
def train_mobilenet(args, to_static): def train_mobilenet(args, to_static):
program_translator.enable(to_static) program_translator.enable(to_static)
with fluid.dygraph.guard(args.place):
place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
) else fluid.CPUPlace()
with fluid.dygraph.guard(place):
np.random.seed(SEED) np.random.seed(SEED)
fluid.default_startup_program().random_seed = SEED fluid.default_startup_program().random_seed = SEED
...@@ -461,7 +463,7 @@ def train_mobilenet(args, to_static): ...@@ -461,7 +463,7 @@ def train_mobilenet(args, to_static):
# 3. reader # 3. reader
train_reader = fake_data_reader(args.batch_size, args.class_dim) train_reader = fake_data_reader(args.batch_size, args.class_dim)
train_data_loader = fluid.io.DataLoader.from_generator(capacity=16) train_data_loader = fluid.io.DataLoader.from_generator(capacity=16)
train_data_loader.set_sample_list_generator(train_reader, place) train_data_loader.set_sample_list_generator(train_reader)
# 4. train loop # 4. train loop
loss_data = [] loss_data = []
...@@ -498,17 +500,64 @@ def train_mobilenet(args, to_static): ...@@ -498,17 +500,64 @@ def train_mobilenet(args, to_static):
batch_id += 1 batch_id += 1
t_last = time.time() t_last = time.time()
if batch_id > args.train_step: if batch_id > args.train_step:
if to_static:
fluid.dygraph.jit.save(net, args.model_save_path)
else:
fluid.dygraph.save_dygraph(net.state_dict(),
args.dy_state_dict_save_path)
break break
return np.array(loss_data) return np.array(loss_data)
def predict_static(args, data):
exe = fluid.Executor(args.place)
# load inference model
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(
args.model_save_path, executor=exe, params_filename=VARIABLE_FILENAME)
pred_res = exe.run(inference_program,
feed={feed_target_names[0]: data},
fetch_list=fetch_targets)
return pred_res[0]
def predict_dygraph(args, data):
program_translator.enable(False)
with fluid.dygraph.guard(args.place):
if args.model == "MobileNetV1":
model = MobileNetV1(class_dim=args.class_dim, scale=1.0)
elif args.model == "MobileNetV2":
model = MobileNetV2(class_dim=args.class_dim, scale=1.0)
# load dygraph trained parameters
model_dict, _ = fluid.load_dygraph(args.dy_state_dict_save_path)
model.set_dict(model_dict)
model.eval()
pred_res = model(fluid.dygraph.to_variable(data))
return pred_res.numpy()
def predict_dygraph_jit(args, data):
with fluid.dygraph.guard(args.place):
model = fluid.dygraph.jit.load(args.model_save_path)
model.eval()
pred_res = model(data)
return pred_res.numpy()
class TestMobileNet(unittest.TestCase): class TestMobileNet(unittest.TestCase):
def setUp(self): def setUp(self):
self.args = Args() self.args = Args()
def train(self, model_name, to_static): def train(self, model_name, to_static):
self.args.model = model_name self.args.model = model_name
self.args.model_save_path = model_name + ".inference.model"
self.args.dy_state_dict_save_path = model_name + ".dygraph"
out = train_mobilenet(self.args, to_static) out = train_mobilenet(self.args, to_static)
return out return out
...@@ -519,12 +568,36 @@ class TestMobileNet(unittest.TestCase): ...@@ -519,12 +568,36 @@ class TestMobileNet(unittest.TestCase):
np.allclose(dy_out, st_out), np.allclose(dy_out, st_out),
msg="dy_out: {}, st_out: {}".format(dy_out, st_out)) msg="dy_out: {}, st_out: {}".format(dy_out, st_out))
def test_mobileNet(self): def assert_same_predict(self, model_name):
self.args.model = model_name
self.args.model_save_path = model_name + ".inference.model"
self.args.dy_state_dict_save_path = model_name + ".dygraph"
local_random = np.random.RandomState(SEED)
image = local_random.random_sample([1, 3, 224, 224]).astype('float32')
dy_pre = predict_dygraph(self.args, image)
st_pre = predict_static(self.args, image)
dy_jit_pre = predict_dygraph_jit(self.args, image)
self.assertTrue(
np.allclose(dy_pre, st_pre),
msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
self.assertTrue(
np.allclose(dy_jit_pre, st_pre),
msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
def test_mobile_net(self):
# MobileNet-V1 # MobileNet-V1
self.assert_same_loss("MobileNetV1") self.assert_same_loss("MobileNetV1")
# MobileNet-V2 # MobileNet-V2
self.assert_same_loss("MobileNetV2") self.assert_same_loss("MobileNetV2")
self.verify_predict()
def verify_predict(self):
# MobileNet-V1
self.assert_same_predict("MobileNetV1")
# MobileNet-V2
self.assert_same_predict("MobileNetV2")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,39 +22,33 @@ import numpy as np ...@@ -22,39 +22,33 @@ import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
from paddle.fluid.dygraph.io import VARIABLE_FILENAME
SEED = 2020
IMAGENET1000 = 1281167 IMAGENET1000 = 1281167
base_lr = 0.1 base_lr = 0.001
momentum_rate = 0.9 momentum_rate = 0.9
l2_decay = 1e-4 l2_decay = 1e-4
batch_size = 8 batch_size = 8
epoch_num = 1 epoch_num = 1
place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
else fluid.CPUPlace() else fluid.CPUPlace()
MODEL_SAVE_PATH = "./resnet.inference.model"
DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph"
program_translator = ProgramTranslator()
if fluid.is_compiled_with_cuda():
fluid.set_flags({'FLAGS_cudnn_deterministic': True})
def optimizer_setting(parameter_list=None): def optimizer_setting(parameter_list=None):
total_images = IMAGENET1000 optimizer = fluid.optimizer.Momentum(
step = int(math.ceil(float(total_images) / batch_size)) learning_rate=base_lr,
epochs = [30, 60, 90] momentum=momentum_rate,
bd = [step * e for e in epochs] regularization=fluid.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list)
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer return optimizer
...@@ -189,8 +183,8 @@ class ResNet(fluid.dygraph.Layer): ...@@ -189,8 +183,8 @@ class ResNet(fluid.dygraph.Layer):
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv))) initializer=fluid.initializer.Uniform(-stdv, stdv)))
@dygraph_to_static_func @declarative
def forward(self, inputs, label): def forward(self, inputs):
y = self.conv(inputs) y = self.conv(inputs)
y = self.pool2d_max(y) y = self.pool2d_max(y)
for bottleneck_block in self.bottleneck_block_list: for bottleneck_block in self.bottleneck_block_list:
...@@ -199,77 +193,144 @@ class ResNet(fluid.dygraph.Layer): ...@@ -199,77 +193,144 @@ class ResNet(fluid.dygraph.Layer):
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
pred = self.out(y) pred = self.out(y)
loss = fluid.layers.cross_entropy(input=pred, label=label) return pred
avg_loss_ = fluid.layers.mean(x=loss)
acc_top1_ = fluid.layers.accuracy(input=pred, label=label, k=1)
acc_top5_ = fluid.layers.accuracy(input=pred, label=label, k=5)
return pred, avg_loss_, acc_top1_, acc_top5_ def reader_decorator(reader):
def __reader__():
for item in reader():
img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield img, label
return __reader__
def train_resnet_in_static_mode():
def train(to_static):
""" """
Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode.
""" """
with fluid.dygraph.guard(place):
np.random.seed(SEED)
fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED
train_reader = paddle.batch(
reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
batch_size=batch_size,
drop_last=True)
data_loader = fluid.io.DataLoader.from_generator(
capacity=5, iterable=True)
data_loader.set_sample_list_generator(train_reader)
resnet = ResNet()
optimizer = optimizer_setting(parameter_list=resnet.parameters())
for epoch in range(epoch_num):
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
for batch_id, data in enumerate(data_loader()):
start_time = time.time()
img, label = data
pred = resnet(img)
loss = fluid.layers.cross_entropy(input=pred, label=label)
avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
avg_loss.backward()
optimizer.minimize(avg_loss)
resnet.clear_gradients()
total_loss += avg_loss
total_acc1 += acc_top1
total_acc5 += acc_top5
total_sample += 1
end_time = time.time()
if batch_id % 2 == 0:
print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \
( epoch, batch_id, total_loss.numpy() / total_sample, \
total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time))
if batch_id == 10:
if to_static:
fluid.dygraph.jit.save(resnet, MODEL_SAVE_PATH)
else:
fluid.dygraph.save_dygraph(resnet.state_dict(),
DY_STATE_DICT_SAVE_PATH)
# avoid dataloader throwing an abort signal
data_loader._reset()
break
return total_loss.numpy()
def predict_dygraph(data):
program_translator.enable(False)
with fluid.dygraph.guard(place):
resnet = ResNet()
model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH)
resnet.set_dict(model_dict)
resnet.eval()
pred_res = resnet(fluid.dygraph.to_variable(data))
return pred_res.numpy()
def predict_static(data):
exe = fluid.Executor(place) exe = fluid.Executor(place)
startup_prog = fluid.Program() [inference_program, feed_target_names,
main_prog = fluid.Program() fetch_targets] = fluid.io.load_inference_model(
MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
with fluid.program_guard(main_prog, startup_prog): pred_res = exe.run(inference_program,
feed={feed_target_names[0]: data},
fetch_list=fetch_targets)
img = fluid.data(name="img", shape=[None, 3, 224, 224], dtype="float32") return pred_res[0]
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
label.stop_gradient = True
resnet = ResNet() def predict_dygraph_jit(data):
pred, avg_loss_, acc_top1_, acc_top5_ = resnet(img, label) with fluid.dygraph.guard(place):
optimizer = optimizer_setting(parameter_list=resnet.parameters()) resnet = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
optimizer.minimize(avg_loss_) resnet.eval()
exe.run(startup_prog) pred_res = resnet(data)
train_reader = paddle.batch( return pred_res.numpy()
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
for epoch in range(epoch_num):
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
for batch_id, data in enumerate(train_reader()):
start_time = time.time()
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data]).astype('int64').reshape(-1,
1)
avg_loss, acc_top1, acc_top5 = exe.run(
main_prog,
feed={"img": dy_x_data,
"label": y_data},
fetch_list=[avg_loss_, acc_top1_, acc_top5_])
total_loss += avg_loss
total_acc1 += acc_top1
total_acc5 += acc_top5
total_sample += 1
end_time = time.time()
if batch_id % 2 == 0:
print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \
( epoch, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample, end_time-start_time))
if batch_id == 10:
break
class TestResnet(unittest.TestCase): class TestResnet(unittest.TestCase):
def test_in_static_mode(self): def train(self, to_static):
train_resnet_in_static_mode() program_translator.enable(to_static)
return train(to_static)
def verify_predict(self):
image = np.random.random([1, 3, 224, 224]).astype('float32')
dy_pre = predict_dygraph(image)
st_pre = predict_static(image)
dy_jit_pre = predict_dygraph_jit(image)
self.assertTrue(
np.allclose(dy_pre, st_pre),
msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
self.assertTrue(
np.allclose(dy_jit_pre, st_pre),
msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
def test_resnet(self):
static_loss = self.train(to_static=True)
dygraph_loss = self.train(to_static=False)
self.assertTrue(
np.allclose(static_loss, dygraph_loss),
msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
dygraph_loss))
self.verify_predict()
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -24,6 +24,7 @@ from paddle.fluid.dygraph.base import to_variable ...@@ -24,6 +24,7 @@ from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
from paddle.fluid.dygraph import declarative from paddle.fluid.dygraph import declarative
from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph import ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME
SEED = 2020 SEED = 2020
np.random.seed(SEED) np.random.seed(SEED)
...@@ -32,6 +33,8 @@ BATCH_SIZE = 8 ...@@ -32,6 +33,8 @@ BATCH_SIZE = 8
EPOCH_NUM = 1 EPOCH_NUM = 1
PRINT_STEP = 2 PRINT_STEP = 2
STEP_NUM = 10 STEP_NUM = 10
MODEL_SAVE_PATH = "./se_resnet.inference.model"
DY_STATE_DICT_SAVE_PATH = "./se_resnet.dygraph"
place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
else fluid.CPUPlace() else fluid.CPUPlace()
...@@ -377,11 +380,60 @@ def train(train_reader, to_static): ...@@ -377,11 +380,60 @@ def train(train_reader, to_static):
step_idx += 1 step_idx += 1
if step_idx == STEP_NUM: if step_idx == STEP_NUM:
if to_static:
configs = fluid.dygraph.jit.SaveLoadConfig()
configs.output_spec = [pred]
fluid.dygraph.jit.save(se_resnext, MODEL_SAVE_PATH,
[img], configs)
else:
fluid.dygraph.save_dygraph(se_resnext.state_dict(),
DY_STATE_DICT_SAVE_PATH)
break break
return pred.numpy(), avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy( return pred.numpy(), avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(
) )
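The configs.output_spec = [pred] used in the save branch above prunes the saved inference program to that single output, which is why the static and jit-loaded prediction paths below fetch only pred. A compact, self-contained sketch of the same pruning; the TwoHead layer and save path are hypothetical:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import declarative

class TwoHead(fluid.dygraph.Layer):
    def __init__(self):
        super(TwoHead, self).__init__()
        self.fc = fluid.dygraph.Linear(4, 2)

    @declarative
    def forward(self, x):
        pred = self.fc(x)
        aux = fluid.layers.mean(pred)
        return pred, aux

with fluid.dygraph.guard():
    net = TwoHead()
    x = fluid.dygraph.to_variable(np.ones([1, 4], dtype='float32'))
    pred, _ = net(x)
    configs = fluid.dygraph.jit.SaveLoadConfig()
    configs.output_spec = [pred]  # keep only `pred` as a fetch target in the saved program
    fluid.dygraph.jit.save(net, "./twohead.inference.model", [x], configs)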
def predict_dygraph(data):
program_translator = ProgramTranslator()
program_translator.enable(False)
with fluid.dygraph.guard(place):
se_resnext = SeResNeXt()
model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH)
se_resnext.set_dict(model_dict)
se_resnext.eval()
label = np.random.random([1, 1]).astype("int64")
img = fluid.dygraph.to_variable(data)
label = fluid.dygraph.to_variable(label)
pred_res, _, _, _ = se_resnext(img, label)
return pred_res.numpy()
def predict_static(data):
exe = fluid.Executor(place)
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(
MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME)
pred_res = exe.run(inference_program,
feed={feed_target_names[0]: data},
fetch_list=fetch_targets)
return pred_res[0]
def predict_dygraph_jit(data):
with fluid.dygraph.guard(place):
se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PATH)
se_resnext.eval()
pred_res = se_resnext(data)
return pred_res.numpy()
class TestSeResnet(unittest.TestCase): class TestSeResnet(unittest.TestCase):
def setUp(self): def setUp(self):
self.train_reader = paddle.batch( self.train_reader = paddle.batch(
...@@ -390,6 +442,18 @@ class TestSeResnet(unittest.TestCase): ...@@ -390,6 +442,18 @@ class TestSeResnet(unittest.TestCase):
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
drop_last=True) drop_last=True)
def verify_predict(self):
image = np.random.random([1, 3, 224, 224]).astype('float32')
dy_pre = predict_dygraph(image)
st_pre = predict_static(image)
dy_jit_pre = predict_dygraph_jit(image)
self.assertTrue(
np.allclose(dy_pre, st_pre),
msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
self.assertTrue(
np.allclose(dy_jit_pre, st_pre),
msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
def test_check_result(self): def test_check_result(self):
pred_1, loss_1, acc1_1, acc5_1 = train( pred_1, loss_1, acc1_1, acc5_1 = train(
self.train_reader, to_static=False) self.train_reader, to_static=False)
...@@ -409,6 +473,8 @@ class TestSeResnet(unittest.TestCase): ...@@ -409,6 +473,8 @@ class TestSeResnet(unittest.TestCase):
np.allclose(acc5_1, acc5_2), np.allclose(acc5_1, acc5_2),
msg="static acc5: {} \ndygraph acc5: {}".format(acc5_1, acc5_2)) msg="static acc5: {} \ndygraph acc5: {}".format(acc5_1, acc5_2))
self.verify_predict()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -72,25 +72,40 @@ def create_paddle_case(op_type, callback): ...@@ -72,25 +72,40 @@ def create_paddle_case(op_type, callback):
class PaddleCls(unittest.TestCase): class PaddleCls(unittest.TestCase):
def setUp(self): def setUp(self):
self.op_type = op_type self.op_type = op_type
self.input_x = np.array([1, 2, 3, 4]) self.input_x = np.array([1, 2, 3, 4]).astype(np.int64)
self.input_y = np.array([1, 3, 2, 4]) self.input_y = np.array([1, 3, 2, 4]).astype(np.int64)
self.real_result = callback(self.input_x, self.input_y) self.real_result = callback(self.input_x, self.input_y)
self.place = fluid.CPUPlace()
if core.is_compiled_with_cuda():
self.place = paddle.CUDAPlace(0)
def test_api(self): def test_api(self):
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
x = fluid.layers.data(name='x', shape=[4], dtype='int64') x = fluid.data(name='x', shape=[4], dtype='int64')
y = fluid.layers.data(name='y', shape=[4], dtype='int64') y = fluid.data(name='y', shape=[4], dtype='int64')
op = eval("paddle.%s" % (self.op_type)) op = eval("paddle.%s" % (self.op_type))
out = op(x, y) out = op(x, y)
place = fluid.CPUPlace() exe = fluid.Executor(self.place)
if core.is_compiled_with_cuda():
place = paddle.CUDAPlace(0)
exe = fluid.Executor(place)
res, = exe.run(feed={"x": self.input_x, res, = exe.run(feed={"x": self.input_x,
"y": self.input_y}, "y": self.input_y},
fetch_list=[out]) fetch_list=[out])
self.assertEqual((res == self.real_result).all(), True) self.assertEqual((res == self.real_result).all(), True)
def test_broadcast_api_1(self):
with program_guard(Program(), Program()):
x = paddle.nn.data(name='x', shape=[1, 2, 1, 3], dtype='int32')
y = paddle.nn.data(name='y', shape=[1, 2, 3], dtype='int32')
op = eval("paddle.%s" % (self.op_type))
out = op(x, y)
exe = paddle.Executor(self.place)
input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
real_result = callback(input_x, input_y)
res, = exe.run(feed={"x": input_x,
"y": input_y},
fetch_list=[out])
self.assertEqual((res == real_result).all(), True)
def test_attr_name(self): def test_attr_name(self):
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
x = fluid.layers.data(name='x', shape=[4], dtype='int32') x = fluid.layers.data(name='x', shape=[4], dtype='int32')
...@@ -104,6 +119,7 @@ def create_paddle_case(op_type, callback): ...@@ -104,6 +119,7 @@ def create_paddle_case(op_type, callback):
globals()[cls_name] = PaddleCls globals()[cls_name] = PaddleCls
create_paddle_case('less_than', lambda _a, _b: _a < _b)
create_paddle_case('less_equal', lambda _a, _b: _a <= _b) create_paddle_case('less_equal', lambda _a, _b: _a <= _b)
create_paddle_case('greater_than', lambda _a, _b: _a > _b) create_paddle_case('greater_than', lambda _a, _b: _a > _b)
create_paddle_case('greater_equal', lambda _a, _b: _a >= _b) create_paddle_case('greater_equal', lambda _a, _b: _a >= _b)
......
...@@ -39,7 +39,6 @@ class TestDistMnistNCCL2DGC(TestDistBase): ...@@ -39,7 +39,6 @@ class TestDistMnistNCCL2DGC(TestDistBase):
self._nccl2_mode = True self._nccl2_mode = True
self._use_dgc = True self._use_dgc = True
@unittest.skip(reason="Skip unstable ut")
def test_dist_train(self): def test_dist_train(self):
import paddle.fluid as fluid import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
...@@ -69,7 +68,6 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase): ...@@ -69,7 +68,6 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
self._nccl2_mode = True self._nccl2_mode = True
self._use_dgc = True self._use_dgc = True
@unittest.skip(reason="Skip unstable ut")
def test_dist_train(self): def test_dist_train(self):
import paddle.fluid as fluid import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.fleet as fleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
class TestFleetDGCOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "1"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
def net(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {
"rampup_begin_step": 128,
"rampup_step": 100,
"sparsity": [0.996, 0.999]
}
return avg_cost, strategy
def test_dgc_optimizer(self):
avg_cost, strategy = self.net()
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('dgc', ops)
self.assertIn('dgc_momentum', ops)
def test_dgc_not_apply_with_adam(self):
avg_cost, strategy = self.net()
optimizer = paddle.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('dgc', ops)
self.assertNotIn('dgc_momentum', ops)
def test_dgc_not_apply_with_one_worker(self):
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
avg_cost, strategy = self.net()
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('dgc', ops)
self.assertNotIn('dgc_momentum', ops)
if __name__ == "__main__":
unittest.main()
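For reference, the dgc_configs dict set above corresponds field-for-field to the DGCConfig message (rampup_begin_step, rampup_step, sparsity). A minimal sketch of setting and reading the values back, assuming the strategy getter hands the settings back as a plain dict:

import paddle.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.dgc = True
strategy.dgc_configs = {
    "rampup_begin_step": 128,    # step at which gradient compression kicks in
    "rampup_step": 100,          # number of steps over which sparsity ramps up
    "sparsity": [0.996, 0.999],  # sparsity schedule applied during ramp-up
}
# Assumption: the getter returns the stored settings as a plain dict.
print(strategy.dgc_configs["rampup_begin_step"])  # 128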
#!/bin/bash
set -e
function test_launch_ps(){
fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
echo "test pserver launch failed"
exit -1
fi
}
if [[ ${WITH_GPU} == "OFF" ]]; then
test_launch_ps
exit 0
fi
test_launch_ps
# use default values
fleetrun multi_process.py
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi
unset PADDLE_PORT
unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
echo "train abort as planned"
fi
abort_str1="abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
if grep -q "$abort_str1" "$file_0"; then
echo "trainer 0 abort as planned"
else
echo "trainer 0 not abort as planned"
exit -1
fi
if [ ! -f $file_1 ]; then
echo "trainer 1 terminate as planned"
else
echo "trainer 1 not terminate as planned"
exit -1
fi
# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0
distributed_args="--gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
...@@ -33,8 +33,10 @@ class TestFleetUtil(unittest.TestCase): ...@@ -33,8 +33,10 @@ class TestFleetUtil(unittest.TestCase):
role_maker = None # should be fleet.PaddleCloudRoleMaker() role_maker = None # should be fleet.PaddleCloudRoleMaker()
optimize_ops = [] optimize_ops = []
params_grads = [] params_grads = []
util = factory._create_util(strategy, role_maker, optimize_ops, context = {}
params_grads) context["role_maker"] = role_maker
context["valid_strategy"] = strategy
util = factory._create_util(context)
self.assertEqual(util.role_maker, None) self.assertEqual(util.role_maker, None)
def test_get_util(self): def test_get_util(self):
......
...@@ -298,16 +298,15 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): ...@@ -298,16 +298,15 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
class TestDygraphDoubleGradVisitedUniq(TestCase): class TestDygraphDoubleGradVisitedUniq(TestCase):
def test_compare(self): def test_compare(self):
value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, value = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
5).astype("float32")
def model_f(input): def model_f(input):
linear = fluid.dygraph.Linear(5, 3, bias_attr=False) conv2d = fluid.dygraph.Conv2D(3, 2, 3)
for i in range(10): for i in range(10):
if i == 0: if i == 0:
out = linear(input) out = conv2d(input)
else: else:
out = out + linear(input) out = out + conv2d(input)
return out return out
backward_strategy = fluid.dygraph.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
...@@ -319,8 +318,14 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): ...@@ -319,8 +318,14 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
out = model_f(a) out = model_f(a)
dx=fluid.dygraph.grad(outputs=[out],inputs=[a],create_graph=True,retain_graph=True, \ dx = fluid.dygraph.grad(
only_inputs=True,allow_unused=False, backward_strategy=backward_strategy) outputs=[out],
inputs=[a],
create_graph=True,
retain_graph=True,
only_inputs=True,
allow_unused=False,
backward_strategy=backward_strategy)
grad_1 = dx[0].numpy() grad_1 = dx[0].numpy()
...@@ -334,7 +339,9 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): ...@@ -334,7 +339,9 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
grad_2 = a.gradient() grad_2 = a.gradient()
self.assertTrue(np.array_equal(grad_1, grad_2)) self.assertTrue(
np.allclose(
grad_1, grad_2, rtol=1.e-5, atol=1.e-8, equal_nan=True))
if __name__ == '__main__': if __name__ == '__main__':
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.nn as nn
import paddle.fluid as fluid
import numpy as np
class LeNetDygraph(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super(LeNetDygraph, self).__init__()
self.num_classes = num_classes
self.features = nn.Sequential(
nn.Conv2D(
1, 6, 3, stride=1, padding=1),
nn.ReLU(),
nn.Pool2D(2, 'max', 2),
nn.Conv2D(
6, 16, 5, stride=1, padding=0),
nn.ReLU(),
nn.Pool2D(2, 'max', 2))
if num_classes > 0:
self.fc = nn.Sequential(
nn.Linear(400, 120),
nn.Linear(120, 84),
nn.Linear(
84, 10, act=classifier_activation))
def forward(self, inputs):
x = self.features(inputs)
if self.num_classes > 0:
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
def init_weights(layer):
if type(layer) == nn.Linear:
new_weight = paddle.fill_constant(
layer.weight.shape, layer.weight.dtype, value=0.9)
layer.weight.set_value(new_weight)
new_bias = paddle.fill_constant(
layer.bias.shape, layer.bias.dtype, value=-0.1)
layer.bias.set_value(new_bias)
elif type(layer) == nn.Conv2D:
new_weight = paddle.fill_constant(
layer.weight.shape, layer.weight.dtype, value=0.7)
layer.weight.set_value(new_weight)
new_bias = paddle.fill_constant(
layer.bias.shape, layer.bias.dtype, value=-0.2)
layer.bias.set_value(new_bias)
class TestLayerApply(unittest.TestCase):
def test_apply_init_weight(self):
with fluid.dygraph.guard():
net = LeNetDygraph()
net.apply(init_weights)
for layer in net.sublayers():
if type(layer) == nn.Linear:
np.testing.assert_allclose(layer.weight.numpy(), 0.9)
np.testing.assert_allclose(layer.bias.numpy(), -0.1)
elif type(layer) == nn.Conv2D:
np.testing.assert_allclose(layer.weight.numpy(), 0.7)
np.testing.assert_allclose(layer.bias.numpy(), -0.2)
if __name__ == '__main__':
unittest.main()
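Layer.apply, which the test above relies on, walks the layer tree and calls the given function on every sublayer (and, in the usual implementation, on the layer itself). A tiny illustrative sketch, with the exact visiting order left unspecified:

import paddle.fluid as fluid
import paddle.nn as nn

with fluid.dygraph.guard():
    net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
    visited = []
    net.apply(lambda layer: visited.append(type(layer).__name__))
    # Every Linear sublayer (and the Sequential container) ends up in `visited`.
    print(visited)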
...@@ -37,7 +37,7 @@ class TestSortOnCPU(unittest.TestCase): ...@@ -37,7 +37,7 @@ class TestSortOnCPU(unittest.TestCase):
[[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]], [[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
[[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]], [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
dtype='float32') dtype='float32')
result, = exe.run(feed={'input': data}, fetch_list=[output[0]]) result, = exe.run(feed={'input': data}, fetch_list=[output])
np_result = np.sort(result) np_result = np.sort(result)
self.assertEqual((result == np_result).all(), True) self.assertEqual((result == np_result).all(), True)
...@@ -50,7 +50,7 @@ class TestSortOnCPU(unittest.TestCase): ...@@ -50,7 +50,7 @@ class TestSortOnCPU(unittest.TestCase):
[[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]], [[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
[[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]], [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
dtype='float32') dtype='float32')
result, = exe.run(feed={'input': data}, fetch_list=[output[0]]) result, = exe.run(feed={'input': data}, fetch_list=[output])
np_result = np.sort(result, axis=1) np_result = np.sort(result, axis=1)
self.assertEqual((result == np_result).all(), True) self.assertEqual((result == np_result).all(), True)
...@@ -75,7 +75,7 @@ class TestSortDygraph(unittest.TestCase): ...@@ -75,7 +75,7 @@ class TestSortDygraph(unittest.TestCase):
with imperative.guard(self.place): with imperative.guard(self.place):
var_x = imperative.to_variable(self.input_data) var_x = imperative.to_variable(self.input_data)
out = paddle.sort(var_x) out = paddle.sort(var_x)
self.assertEqual((np.sort(self.input_data) == out[0].numpy()).all(), self.assertEqual((np.sort(self.input_data) == out.numpy()).all(),
True) True)
def test_api_1(self): def test_api_1(self):
...@@ -84,5 +84,4 @@ class TestSortDygraph(unittest.TestCase): ...@@ -84,5 +84,4 @@ class TestSortDygraph(unittest.TestCase):
out = paddle.sort(var_x, axis=-1) out = paddle.sort(var_x, axis=-1)
self.assertEqual( self.assertEqual(
(np.sort( (np.sort(
self.input_data, axis=-1) == out[0].numpy()).all(), self.input_data, axis=-1) == out.numpy()).all(), True)
True)
...@@ -25,6 +25,7 @@ from . import datasets ...@@ -25,6 +25,7 @@ from . import datasets
from . import distributed from . import distributed
from . import vision from . import vision
from . import text from . import text
from . import utils
from . import device from . import device
from .device import * from .device import *
...@@ -41,6 +42,7 @@ __all__ = [ ...@@ -41,6 +42,7 @@ __all__ = [
'metrics', 'metrics',
'vision', 'vision',
'text', 'text',
'utils',
] + model.__all__ + device.__all__ ] + model.__all__ + device.__all__
monkey_patch_layer() monkey_patch_layer()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import shutil
import tempfile
from paddle import fluid
from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict
class LeNetDygraph(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super(LeNetDygraph, self).__init__()
self.num_classes = num_classes
self.features = Sequential(
Conv2D(
1, 6, 3, stride=1, padding=1),
ReLU(),
Pool2D(2, 'max', 2),
Conv2D(
6, 16, 5, stride=1, padding=0),
ReLU(),
Pool2D(2, 'max', 2))
if num_classes > 0:
self.fc = Sequential(
Linear(400, 120),
Linear(120, 84),
Linear(
84, 10, act=classifier_activation))
def forward(self, inputs):
x = self.features(inputs)
if self.num_classes > 0:
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
class TestUncombinedWeight2StateDict(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.save_dir = tempfile.mkdtemp()
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.save_dir)
def test_infer(self):
start_prog = fluid.Program()
train_prog = fluid.Program()
x = fluid.data(name='x', shape=[None, 1, 28, 28], dtype='float32')
with fluid.program_guard(train_prog, start_prog):
with fluid.unique_name.guard():
x = fluid.data(
name='x', shape=[None, 1, 28, 28], dtype='float32')
model = LeNetDygraph()
output = model.forward(x)
excutor = fluid.Executor()
excutor.run(start_prog)
test_prog = train_prog.clone(for_test=True)
fluid.io.save_params(excutor, self.save_dir, test_prog)
rand_x = np.random.rand(1, 1, 28, 28).astype('float32')
out = excutor.run(program=test_prog,
feed={'x': rand_x},
fetch_list=[output.name],
return_numpy=True)
state_dict = uncombined_weight_to_state_dict(self.save_dir)
key2key_dict = {
'features.0.weight': 'conv2d_0.w_0',
'features.0.bias': 'conv2d_0.b_0',
'features.3.weight': 'conv2d_1.w_0',
'features.3.bias': 'conv2d_1.b_0',
'fc.0.weight': 'linear_0.w_0',
'fc.0.bias': 'linear_0.b_0',
'fc.1.weight': 'linear_1.w_0',
'fc.1.bias': 'linear_1.b_0',
'fc.2.weight': 'linear_2.w_0',
'fc.2.bias': 'linear_2.b_0'
}
fluid.enable_imperative()
dygraph_model = LeNetDygraph()
converted_state_dict = dygraph_model.state_dict()
for k1, k2 in key2key_dict.items():
converted_state_dict[k1] = state_dict[k2]
dygraph_model.set_dict(converted_state_dict)
dygraph_model.eval()
dy_out = dygraph_model(fluid.dygraph.to_variable(rand_x))
np.testing.assert_allclose(dy_out.numpy(), out[0], atol=1e-5)
if __name__ == '__main__':
unittest.main()
...@@ -12,13 +12,171 @@ ...@@ -12,13 +12,171 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import inspect import inspect
import numpy as np import numpy as np
from collections import OrderedDict
from paddle import fluid from paddle import fluid
from paddle.fluid.framework import Variable from paddle.fluid.framework import Variable
from paddle.fluid.executor import global_scope from paddle.fluid.executor import global_scope
__all__ = ['uncombined_weight_to_state_dict']
def uncombined_weight_to_state_dict(weight_dir):
"""
Convert uncombined weights saved by `fluid.io.save_params` or `fluid.io.save_persistables` into a state_dict.
Args:
weight_dir (str): weight directory path.
Returns:
OrderedDict: weight dict.
Examples:
.. code-block:: python
import os
from paddle import fluid
from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential
from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict
class LeNetDygraph(fluid.dygraph.Layer):
def __init__(self, num_classes=10, classifier_activation='softmax'):
super(LeNetDygraph, self).__init__()
self.num_classes = num_classes
self.features = Sequential(
Conv2D(
1, 6, 3, stride=1, padding=1),
ReLU(),
Pool2D(2, 'max', 2),
Conv2D(
6, 16, 5, stride=1, padding=0),
ReLU(),
Pool2D(2, 'max', 2))
if num_classes > 0:
self.fc = Sequential(
Linear(400, 120),
Linear(120, 84),
Linear(
84, 10, act=classifier_activation))
def forward(self, inputs):
x = self.features(inputs)
if self.num_classes > 0:
x = fluid.layers.flatten(x, 1)
x = self.fc(x)
return x
# save weight use fluid.io.save_params
save_dir = 'temp'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
start_prog = fluid.Program()
train_prog = fluid.Program()
x = fluid.data(name='x', shape=[None, 1, 28, 28], dtype='float32')
with fluid.program_guard(train_prog, start_prog):
with fluid.unique_name.guard():
x = fluid.data(
name='x', shape=[None, 1, 28, 28], dtype='float32')
model = LeNetDygraph()
output = model.forward(x)
excutor = fluid.Executor()
excutor.run(start_prog)
test_prog = train_prog.clone(for_test=True)
fluid.io.save_params(excutor, save_dir, test_prog)
# convert uncombined weight to state dict
state_dict = uncombined_weight_to_state_dict(save_dir)
key2key_dict = {
'features.0.weight': 'conv2d_0.w_0',
'features.0.bias': 'conv2d_0.b_0',
'features.3.weight': 'conv2d_1.w_0',
'features.3.bias': 'conv2d_1.b_0',
'fc.0.weight': 'linear_0.w_0',
'fc.0.bias': 'linear_0.b_0',
'fc.1.weight': 'linear_1.w_0',
'fc.1.bias': 'linear_1.b_0',
'fc.2.weight': 'linear_2.w_0',
'fc.2.bias': 'linear_2.b_0'
}
fluid.enable_imperative()
dygraph_model = LeNetDygraph()
converted_state_dict = dygraph_model.state_dict()
for k1, k2 in key2key_dict.items():
converted_state_dict[k1] = state_dict[k2]
# dygraph model load state dict which converted from uncombined weight
dygraph_model.set_dict(converted_state_dict)
"""
def _get_all_params_name(dir):
params_name = []
dir = os.path.expanduser(dir)
dir_len = len(dir)
for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
for fname in sorted(fnames):
path = os.path.join(root[dir_len:], fname)
params_name.append(path)
return params_name
class Load(fluid.dygraph.Layer):
def __init__(self):
super(Load, self).__init__()
def forward(self, filename):
weight = self.create_parameter(
shape=[1],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(0.0))
self._helper.append_op(
type='load',
inputs={},
outputs={'Out': [weight]},
attrs={'file_path': filename})
return weight
params_name_list = _get_all_params_name(weight_dir)
if not fluid.in_dygraph_mode():
dygraph_enabled = False
fluid.enable_imperative()
else:
dygraph_enabled = True
load = Load()
state_dict = OrderedDict()
for param_name in params_name_list:
param_path = os.path.join(weight_dir, param_name)
weight = load(param_path)
try:
weight = weight.numpy()
except Exception as e:
print(e)
state_dict[param_name] = weight
if not dygraph_enabled:
fluid.disable_imperative()
return state_dict
def to_list(value): def to_list(value):
if value is None: if value is None:
......
...@@ -46,8 +46,7 @@ def argsort(x, axis=-1, descending=False, name=None): ...@@ -46,8 +46,7 @@ def argsort(x, axis=-1, descending=False, name=None):
:alias_main: paddle.argsort :alias_main: paddle.argsort
:alias: paddle.argsort,paddle.tensor.argsort,paddle.tensor.search.argsort :alias: paddle.argsort,paddle.tensor.argsort,paddle.tensor.search.argsort
This OP sorts the input along the given axis, and returns sorted output This OP sorts the input along the given axis, and returns the corresponding index tensor for the sorted output values. The default sort algorithm is ascending; to sort in descending order, set :attr:`descending` to True.
data Varibale and its corresponding index Variable with the same shape as ``x``.
Args: Args:
x(Tensor): An input N-D Tensor with type float32, float64, int16, x(Tensor): An input N-D Tensor with type float32, float64, int16,
...@@ -84,26 +83,26 @@ def argsort(x, axis=-1, descending=False, name=None): ...@@ -84,26 +83,26 @@ def argsort(x, axis=-1, descending=False, name=None):
out2 = paddle.argsort(x=x, axis=0) out2 = paddle.argsort(x=x, axis=0)
out3 = paddle.argsort(x=x, axis=1) out3 = paddle.argsort(x=x, axis=1)
print(out1.numpy()) print(out1.numpy())
#[[[0 3 1 2] #[[[0 3 1 2]
# [0 1 2 3] # [0 1 2 3]
# [2 3 0 1]] # [2 3 0 1]]
# [[1 3 2 0] # [[1 3 2 0]
# [0 1 2 3] # [0 1 2 3]
# [2 0 3 1]]] # [2 0 3 1]]]
print(out2.numpy()) print(out2.numpy())
#[[[0 1 1 1] #[[[0 1 1 1]
# [0 0 0 0] # [0 0 0 0]
# [1 1 1 0]] # [1 1 1 0]]
# [[1 0 0 0] # [[1 0 0 0]
# [1 1 1 1] # [1 1 1 1]
# [0 0 0 1]]] # [0 0 0 1]]]
print(out3.numpy()) print(out3.numpy())
#[[[1 1 1 2] #[[[1 1 1 2]
# [0 0 2 0] # [0 0 2 0]
# [2 2 0 1]] # [2 2 0 1]]
# [[2 0 2 0] # [[2 0 2 0]
# [1 1 0 2] # [1 1 0 2]
# [0 2 1 1]]] # [0 2 1 1]]]
""" """
if in_dygraph_mode(): if in_dygraph_mode():
_, ids = core.ops.argsort(x, 'axis', axis, 'descending', descending) _, ids = core.ops.argsort(x, 'axis', axis, 'descending', descending)
...@@ -381,8 +380,7 @@ def sort(x, axis=-1, descending=False, name=None): ...@@ -381,8 +380,7 @@ def sort(x, axis=-1, descending=False, name=None):
:alias_main: paddle.sort :alias_main: paddle.sort
:alias: paddle.sort,paddle.tensor.sort,paddle.tensor.search.sort :alias: paddle.sort,paddle.tensor.sort,paddle.tensor.search.sort
This OP sorts the input along the given axis, and returns sorted output This OP sorts the input along the given axis, and returns the sorted output tensor. The default sort algorithm is ascending; to sort in descending order, set :attr:`descending` to True.
data Tensor and its corresponding index Tensor with the same shape as ``x``.
Args: Args:
x(Tensor): An input N-D Tensor with type float32, float64, int16, x(Tensor): An input N-D Tensor with type float32, float64, int16,
...@@ -397,9 +395,7 @@ def sort(x, axis=-1, descending=False, name=None): ...@@ -397,9 +395,7 @@ def sort(x, axis=-1, descending=False, name=None):
need for user to set this property. For more information, please need for user to set this property. For more information, please
refer to :ref:`api_guide_Name`. refer to :ref:`api_guide_Name`.
Returns: Returns:
tuple: A tuple of sorted data tensor(with the same shape and data Tensor: sorted tensor (with the same shape and data type as ``x``).
type as ``x``) and the sorted indices(with the same shape as ``x``
and with data type int64).
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
...@@ -417,38 +413,31 @@ def sort(x, axis=-1, descending=False, name=None): ...@@ -417,38 +413,31 @@ def sort(x, axis=-1, descending=False, name=None):
out1 = paddle.sort(x=x, axis=-1) out1 = paddle.sort(x=x, axis=-1)
out2 = paddle.sort(x=x, axis=0) out2 = paddle.sort(x=x, axis=0)
out3 = paddle.sort(x=x, axis=1) out3 = paddle.sort(x=x, axis=1)
print(out1[0].numpy()) print(out1.numpy())
#[[[5. 5. 8. 9.] #[[[5. 5. 8. 9.]
# [0. 0. 1. 7.] # [0. 0. 1. 7.]
# [2. 4. 6. 9.]] # [2. 4. 6. 9.]]
# [[2. 2. 4. 5.] # [[2. 2. 4. 5.]
# [4. 7. 7. 9.] # [4. 7. 7. 9.]
# [0. 1. 6. 7.]]] # [0. 1. 6. 7.]]]
print(out1[1].numpy()) print(out2.numpy())
#[[[0 3 1 2]
# [0 1 2 3]
# [2 3 0 1]]
# [[1 3 2 0]
# [0 1 2 3]
# [2 0 3 1]]]
print(out2[0].numpy())
#[[[5. 2. 4. 2.] #[[[5. 2. 4. 2.]
# [0. 0. 1. 7.] # [0. 0. 1. 7.]
# [1. 7. 0. 4.]] # [1. 7. 0. 4.]]
# [[5. 8. 9. 5.] # [[5. 8. 9. 5.]
# [4. 7. 7. 9.] # [4. 7. 7. 9.]
# [6. 9. 2. 6.]]] # [6. 9. 2. 6.]]]
print(out3[0].numpy()) print(out3.numpy())
#[[[0. 0. 1. 4.] #[[[0. 0. 1. 4.]
# [5. 8. 2. 5.] # [5. 8. 2. 5.]
# [6. 9. 9. 7.]] # [6. 9. 9. 7.]]
# [[1. 2. 0. 2.] # [[1. 2. 0. 2.]
# [4. 7. 4. 6.] # [4. 7. 4. 6.]
# [5. 7. 7. 9.]]] # [5. 7. 7. 9.]]]
""" """
if in_dygraph_mode(): if in_dygraph_mode():
out, ids = core.ops.argsort(x, 'axis', axis, 'descending', descending) out, _ = core.ops.argsort(x, 'axis', axis, 'descending', descending)
return out, ids return out
helper = LayerHelper("sort", **locals()) helper = LayerHelper("sort", **locals())
out = helper.create_variable_for_type_inference( out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=False) dtype=x.dtype, stop_gradient=False)
...@@ -461,7 +450,7 @@ def sort(x, axis=-1, descending=False, name=None): ...@@ -461,7 +450,7 @@ def sort(x, axis=-1, descending=False, name=None):
'Indices': ids}, 'Indices': ids},
attrs={'axis': axis, attrs={'axis': axis,
'descending': descending}) 'descending': descending})
return out, ids return out
def where(condition, x, y, name=None): def where(condition, x, y, name=None):
......
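In short, after this change paddle.sort returns only the sorted tensor, and the indices come from paddle.argsort. A small dygraph sketch of the new contract; the input values are arbitrary:

import numpy as np
import paddle
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(
        np.array([[3., 1., 2.]], dtype='float32'))
    sorted_x = paddle.sort(x, axis=-1)    # a single Tensor now, not a (values, indices) tuple
    indices = paddle.argsort(x, axis=-1)  # the index tensor is obtained from argsort
    print(sorted_x.numpy())  # [[1. 2. 3.]]
    print(indices.numpy())   # [[1 2 0]]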
...@@ -16,8 +16,9 @@ from .plot import Ploter ...@@ -16,8 +16,9 @@ from .plot import Ploter
from .profiler import ProfilerOptions from .profiler import ProfilerOptions
from .profiler import Profiler from .profiler import Profiler
from .profiler import get_profiler from .profiler import get_profiler
from .deprecated import deprecated
__all__ = ['dump_config', 'Ploter'] __all__ = ['dump_config', 'Ploter', 'deprecated']
#TODO: define new api under this directory #TODO: define new api under this directory
# __all__ = ['unique_name', # __all__ = ['unique_name',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
decorator to deprecate a function or class
"""
import warnings
import functools
import paddle
def deprecated(update_to="", since="", reason=""):
"""Decorate a function to signify its deprecation.
This function wraps a method that will soon be removed and does two things:
- The docstring of the API will be modified to include a notice
about deprecation.
- Raises a :class:`~exceptions.DeprecationWarning` when the old API is called.
Args:
since(str): The version at which the decorated method is considered deprecated.
update_to(str): The new API users should use.
reason(str): The reason why the API is deprecated.
Returns:
decorator: decorated function or class.
"""
def decorator(func):
"""construct warning message, and return a decorated function or class."""
assert isinstance(update_to, str), 'type of "update_to" must be str.'
assert isinstance(since, str), 'type of "since" must be str.'
assert isinstance(reason, str), 'type of "reason" must be str.'
_since = since.strip()
_update_to = update_to.strip()
_reason = reason.strip()
msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__)
if len(_since) > 0:
msg += " since {}".format(_since)
msg += ", and may be removed in future versions."
if len(_update_to) > 0:
assert _update_to.startswith(
"paddle."
), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
update_to)
msg += ' Use "{}" instead.'.format(_update_to)
if len(_reason) > 0:
msg += "\n reason: {}".format(_reason)
@functools.wraps(func)
def wrapper(*args, **kwargs):
"""deprecated warning should be fired in 3 circumstances:
1. current version is develop version, i.e. "0.0.0", because we assume develop version is always the latest version.
2. since version is empty, in this case, API is deprecated in all versions.
3. current version is newer than since version.
"""
v_current = [int(i) for i in paddle.__version__.split(".")]
v_current += [0] * (4 - len(v_current))
# guard against an empty `since` (case 2 above), where int("") would raise
v_since = [int(i) for i in _since.split(".")] if _since else [0]
v_since += [0] * (4 - len(v_since))
if paddle.__version__ == "0.0.0" or _since == "" or v_current >= v_since:
warnings.simplefilter('always',
DeprecationWarning) # turn off filter
warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
warnings.simplefilter('default',
DeprecationWarning) # reset filter
return func(*args, **kwargs)
return wrapper
return decorator
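A minimal usage sketch of the decorator above; the decorated function and the replacement API name are hypothetical, and the import path assumes it is exported as paddle.utils.deprecated per the __init__ change above:

from paddle.utils import deprecated

@deprecated(since="2.0.0",
            update_to="paddle.new_api",   # hypothetical replacement name
            reason="demonstration only")
def old_api(x):
    return x

# Calling it emits a DeprecationWarning built from since/update_to/reason whenever
# the installed paddle is the develop build or at least version `since`.
old_api(1)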
...@@ -475,6 +475,11 @@ with redirect_stdout(): ...@@ -475,6 +475,11 @@ with redirect_stdout():
cmdclass={ cmdclass={
'install_headers': InstallHeaders, 'install_headers': InstallHeaders,
'install': InstallCommand, 'install': InstallCommand,
},
entry_points={
'console_scripts': [
'fleetrun = paddle.fleet.launch:launch'
]
} }
) )
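
For reference: a console_scripts entry point of this form makes pip generate a small `fleetrun` wrapper script at install time. As a rough sketch (the real generated file differs in detail), the wrapper amounts to:

import sys
from paddle.fleet.launch import launch  # module:function named in the entry_points declaration above

if __name__ == '__main__':
    sys.exit(launch())

so running `fleetrun` on the command line dispatches to `paddle.fleet.launch.launch()`.
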
......
...@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ...@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel RUN yum install -y gettext-devel sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel
COPY build_scripts /build_scripts COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh RUN bash build_scripts/build.sh
RUN bash build_scripts/install_nccl2.sh && \ RUN bash build_scripts/install_nccl2.sh && \
...@@ -22,6 +22,13 @@ RUN rm -rf build_scripts ...@@ -22,6 +22,13 @@ RUN rm -rf build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# git 2.17.1
RUN wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
tar -xvf git-2.17.1.tar.gz && \
cd git-2.17.1 && \
./configure --prefix=/usr/local && \
make -j8 && make install
# for paddle # for paddle
RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \ tar -xz -C /usr/local && \
......
...@@ -3,9 +3,9 @@ function make_ubuntu_dockerfile(){ ...@@ -3,9 +3,9 @@ function make_ubuntu_dockerfile(){
dockerfile_name="Dockerfile.cuda10_cudnn7_gcc82_ubuntu16" dockerfile_name="Dockerfile.cuda10_cudnn7_gcc82_ubuntu16"
sed 's/<baseimg>/10.1-cudnn7-devel-ubuntu16.04/g' ./Dockerfile.ubuntu >${dockerfile_name} sed 's/<baseimg>/10.1-cudnn7-devel-ubuntu16.04/g' ./Dockerfile.ubuntu >${dockerfile_name}
sed -i 's#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g' ${dockerfile_name} sed -i 's#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g' ${dockerfile_name}
dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
sed -i 's#<install_gcc>#WORKDIR /usr/bin \ sed -i 's#<install_gcc>#WORKDIR /usr/bin \
COPY tools/dockerfile/build_scripts /build_scripts \ COPY tools/dockerfile/build_scripts /build_scripts \
RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
...@@ -24,6 +24,9 @@ function make_centos_dockerfile(){ ...@@ -24,6 +24,9 @@ function make_centos_dockerfile(){
sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name} sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name}
dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \
ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/ && \
rm -rf /usr/include/NvInfer*" ${dockerfile_name}
sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
} }
......