Commit 68e89f8a authored by zlsh80826

merge PaddlePaddle/Paddle develop

......@@ -154,17 +154,10 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
C.PD_EnableMkldnnQuantizer(config.c)
}
func (config *AnalysisConfig) EnableMkldnnBfloat16() {
C.PD_EnableMkldnnBfloat16(config.c)
}
func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
}
func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
}
// SetModelBuffer
// ModelFromMemory
......
......@@ -95,9 +95,10 @@ void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
const std::string& fs_ugi) {
fs_name_ = fs_name;
fs_ugi_ = fs_ugi;
std::string cmd = std::string("hadoop fs");
std::string cmd = std::string("$HADOOP_HOME/bin/hadoop fs");
cmd += " -D fs.default.name=" + fs_name;
cmd += " -D hadoop.job.ugi=" + fs_ugi;
cmd += " -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000";
paddle::framework::hdfs_set_command(cmd);
}
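For reference, a sketch with hypothetical values fs_name = "hdfs://nameservice" and fs_ugi = "user,passwd" (placeholders, not values from this diff): the assembled command becomes

$HADOOP_HOME/bin/hadoop fs -D fs.default.name=hdfs://nameservice -D hadoop.job.ugi=user,passwd -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000

so the change simply anchors the hadoop binary at $HADOOP_HOME/bin instead of relying on PATH.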
......
......@@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
......@@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
cc_test(exception_holder_test SRCS exception_holder_test.cc )
......
......@@ -18,7 +18,8 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
}
// Wait FetchOps.
ClearFetchOp(graph_, &fetch_ops);
for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
}
return fetches;
}
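The added wait is needed because FetchAsyncOpHandle copies fetched tensors asynchronously on the per-place fetch contexts (into CUDAPinnedPlace for GPU sources, as shown further down in this diff), so Run has to synchronize fetch_ctxs_ for every place before handing fetches back to the caller.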
......@@ -162,8 +168,8 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
ir::Node *fetch_node =
graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_,
&local_exec_scopes_, return_merged);
auto *op = new FetchAsyncOpHandle(fetch_node, fetches, i, &local_scopes_,
&local_exec_scopes_, return_merged);
fetch_ops->emplace_back(op);
for (auto &p : places_) {
......@@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
op->AddInput(var);
}
for (auto *var : vars) {
auto *op = var->GeneratedOp();
auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
if (compute_op) {
compute_op->SetLockAndRecordEventFree(false);
}
}
int dep = static_cast<int>(op->NotReadyInputSize());
(*op_deps)[op] = dep;
if (dep == 0) {
......@@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchOpHandle *>(op)) {
if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchAsyncOpHandle *>(op)) {
traced_ops_.emplace_back(op);
}
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node, FetchResultType *data,
size_t offset,
std::vector<Scope *> *local_scopes,
std::vector<Scope *> *local_exec_scopes,
bool return_merged)
: OpHandleBase(node),
data_(data),
offset_(offset),
local_scopes_(local_scopes),
local_exec_scopes_(local_exec_scopes),
return_merged_(return_merged) {}
FetchAsyncOpHandle::~FetchAsyncOpHandle() {}
void FetchAsyncOpHandle::RecordWaitEventOnCtx(
platform::DeviceContext *waited_ctx) {
PADDLE_THROW(platform::errors::PermissionDenied(
"No nodes need to wait FetchAsyncOp. Unexpceted Error."));
}
static void CheckTensorAttrs(const LoDTensor *tensor,
const proto::VarType::Type &type,
const DataLayout &layout, const DDim &dims,
const LoD &lod, const size_t offset) {
if (tensor->numel() && tensor->IsInitialized()) {
// step1: check type
PADDLE_ENFORCE_EQ(
type, tensor->type(),
platform::errors::InvalidArgument(
"The data type of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
DataTypeToString(type), DataTypeToString(tensor->type()), offset));
// step2: check layout
PADDLE_ENFORCE_EQ(
layout, tensor->layout(),
platform::errors::InvalidArgument(
"The layout of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
DataLayoutToString(layout), DataLayoutToString(tensor->layout()),
offset));
}
// step3: check dims
auto tensor_dims = tensor->dims();
PADDLE_ENFORCE_EQ(dims.size(), tensor_dims.size(),
platform::errors::InvalidArgument(
"The dimension sizes of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
dims, tensor_dims, offset));
for (int j = 1; j < dims.size(); j++) {
PADDLE_ENFORCE_EQ(dims[j], tensor_dims[j],
platform::errors::InvalidArgument(
"The dimensions of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices(%s vs %s). And the error is caused by the "
"%zu (th) fetched variable. Please set the "
"parameter `return_merged = False` when "
"you call the `Executor.run()` method.",
dims, tensor_dims, offset));
}
// step4: check lod
PADDLE_ENFORCE_EQ(
lod.size(), tensor->lod().size(),
platform::errors::InvalidArgument(
"The LoD information of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
lod, tensor->lod(), offset));
}
static void TransData(const framework::Tensor *src_item,
framework::Tensor *dst_item,
const platform::DeviceContext &ctx) {
if (src_item->IsInitialized() && src_item->numel() > 0) {
if (platform::is_gpu_place(src_item->place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
#endif
} else {
TensorCopy(*src_item, platform::CPUPlace(), dst_item);
}
}
}
void FetchAsyncOpHandle::FetchMergedLodTensor(
const std::vector<const LoDTensor *> &src_lodtensors,
LoDTensor *dst_lodtensor) {
// compute dst type, layout, dim, lod, and the check dim
proto::VarType::Type new_type = proto::VarType::FP32;
framework::DataLayout new_layout;
framework::DDim new_dim;
LoD new_lod = src_lodtensors[0]->lod();
framework::DDim check_dim;
for (auto *t : src_lodtensors) {
if (t->numel() && t->IsInitialized()) {
check_dim = t->dims();
new_type = t->type();
new_layout = t->layout();
break;
}
}
bool find_first_dims = false;
for (auto *t : src_lodtensors) {
if (t->numel() && t->IsInitialized()) {
if (!find_first_dims) {
new_dim = t->dims();
find_first_dims = true;
} else {
new_dim[0] += t->dims()[0];
}
}
}
// check src type, layout, dim, lod consistency
for (size_t i = 1; i < src_lodtensors.size(); ++i) {
CheckTensorAttrs(src_lodtensors[i], new_type, new_layout, check_dim,
new_lod, offset_);
}
// set dst tensor
dst_lodtensor->Resize(new_dim);
dst_lodtensor->set_layout(src_lodtensors[0]->layout());
dst_lodtensor->set_lod(src_lodtensors[0]->lod());
if (platform::is_gpu_place(src_lodtensors[0]->place())) {
dst_lodtensor->mutable_data(platform::CUDAPinnedPlace(),
src_lodtensors[0]->type());
} else {
dst_lodtensor->mutable_data(platform::CPUPlace(),
src_lodtensors[0]->type());
}
// slice and memcpy
int begin = 0;
for (auto *src : src_lodtensors) {
int end = begin + src->dims()[0];
if (end == begin) {
continue;
}
auto dst = dst_lodtensor->Slice(begin, end);
TransData(src, &dst, *dev_ctxes_[src->place()]);
begin = end;
}
}
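As a concrete sketch (a hypothetical two-device fetch): source shapes {2, 3} and {4, 3} give new_dim = {6, 3}; the copy loop then fills destination rows [0, 2) from the first tensor and rows [2, 6) from the second via TransData, skipping any empty sources.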
void FetchAsyncOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
// get src vars
auto &scopes = *local_exec_scopes_;
std::vector<Variable *> src_vars;
src_vars.reserve(inputs_.size());
for (size_t i = 0; i < inputs_.size(); ++i) {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx());
auto *var = scope->FindVar(var_handle->name());
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound(
"Cannot find variable %s in execution scope.", var_handle->name()));
src_vars.emplace_back(var);
}
if (return_merged_) {
auto &val = BOOST_GET(FetchList, *data_);
if (src_vars[0]->IsType<LoDTensor>()) {
// to lodtensor type
std::vector<const LoDTensor *> src_lodtensors;
src_lodtensors.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
src_lodtensors.emplace_back(&src_vars[i]->Get<framework::LoDTensor>());
}
LoDTensor dst_lodtensor;
FetchMergedLodTensor(src_lodtensors, &dst_lodtensor);
val.at(offset_) = std::move(dst_lodtensor);
} else {
// to lodtensorarray type
std::vector<const LoDTensorArray *> src_lodtensor_arrays;
src_lodtensor_arrays.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
src_lodtensor_arrays.emplace_back(
&src_vars[i]->Get<framework::LoDTensorArray>());
}
LoDTensorArray dst_lodtensor_array;
dst_lodtensor_array.resize(src_lodtensor_arrays[0]->size());
for (size_t i = 0; i < dst_lodtensor_array.size(); ++i) {
std::vector<const LoDTensor *> src_lodtensors;
src_lodtensors.reserve(src_lodtensor_arrays.size());
for (size_t j = 0; j < src_lodtensor_arrays.size(); ++j) {
src_lodtensors.emplace_back(&(*src_lodtensor_arrays[j])[i]);
}
FetchMergedLodTensor(src_lodtensors, &dst_lodtensor_array[i]);
}
val.at(offset_) = std::move(dst_lodtensor_array);
}
} else {
auto &val = BOOST_GET(FetchUnmergedList, *data_);
auto &dst_tensors = val.at(offset_);
dst_tensors.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
if (src_vars[i]->IsType<LoDTensor>()) {
auto &t = src_vars[i]->Get<framework::LoDTensor>();
LoDTensor item;
TransData(&t, &item, *dev_ctxes_[t.place()]);
dst_tensors.emplace_back(std::move(item));
} else {
auto &t = src_vars[i]->Get<framework::LoDTensorArray>();
LoDTensorArray item;
item.resize(t.size());
for (size_t j = 0; j < t.size(); ++j) {
TransData(&t[j], &item[j], *dev_ctxes_[t[j].place()]);
}
dst_tensors.emplace_back(std::move(item));
}
}
}
}
bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; }
std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FetchAsyncOpHandle : public OpHandleBase {
public:
FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, size_t offset,
std::vector<Scope *> *local_scopes,
std::vector<Scope *> *local_exec_scopes,
bool return_merged);
~FetchAsyncOpHandle();
void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
std::string Name() const override;
bool IsMultiDeviceTransfer() override;
protected:
void RunImpl() override;
std::vector<Scope *> GetLocalScopes() override { return *local_scopes_; }
void FetchMergedLodTensor(
const std::vector<const LoDTensor *> &src_lodtensors,
LoDTensor *dst_lodtensor);
private:
FetchResultType *data_;
size_t offset_;
std::vector<Scope *> *local_scopes_;
std::vector<Scope *> *local_exec_scopes_;
bool return_merged_;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data,
FetchOpHandle::~FetchOpHandle() {}
void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
PADDLE_THROW(platform::errors::PermissionDenied(
"No nodes need to wait FetchOp. Unexpceted Error."));
}
static void CheckDims(const framework::DDim &tensor_dims,
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
namespace paddle {
namespace framework {
......@@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
if (fetch_ops->empty()) return;
for (auto& op : *fetch_ops) {
PADDLE_ENFORCE_NOT_NULL(
dynamic_cast<FetchOpHandle*>(op),
"The input ops of ClearFetchOp function should be FetchOpHandle.");
PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
true,
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle.");
for (auto& out_var : op->Node()->outputs) {
graph->RemoveNode(out_var);
}
......
......@@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
float* g = g_tensor->data<float>();
if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
int dim = emb_dim + offset;
int dim = emb_dim;
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / dim, dim);
......@@ -1170,6 +1170,21 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
#endif
}
void FleetWrapper::LoadWithWhitelist(const uint64_t table_id,
const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path,
std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "load model of table id: " << table_id
<< ", from path: " << path << " failed";
}
#else
VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib";
#endif
}
void FleetWrapper::SaveModel(const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode));
......@@ -1285,6 +1300,26 @@ int32_t FleetWrapper::SaveCache(int table_id, const std::string& path,
#endif
}
int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path,
const int mode,
const std::string& whitelist_path) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist(
table_id, path, std::to_string(mode), whitelist_path);
ret.wait();
int32_t feasign_cnt = ret.get();
if (feasign_cnt == -1) {
LOG(ERROR) << "table save cache failed";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
return feasign_cnt;
#else
VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
return -1;
#endif
}
void FleetWrapper::ShrinkSparseTable(int table_id) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->shrink(table_id);
......
......@@ -273,6 +273,11 @@ class FleetWrapper {
// save cache model
// cache model can speed up online predict
int32_t SaveCache(int table_id, const std::string& path, const int mode);
// save sparse table filtered by user-defined whitelist
int32_t SaveWithWhitelist(int table_id, const std::string& path,
const int mode, const std::string& whitelist_path);
void LoadWithWhitelist(const uint64_t table_id, const std::string& path,
const int mode);
// copy feasign key/value from src_table_id to dest_table_id
int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
// copy feasign key/value from src_table_id to dest_table_id
......
......@@ -21,6 +21,8 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
......@@ -47,6 +49,9 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward) {
VLOG(1) << "Trace Op: " << type;
if (FLAGS_use_mkldnn) {
attrs["use_mkldnn"] = true;
}
auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
const auto& op_info = op->Info();
auto* attr_checker = op_info.Checker();
......
......@@ -217,17 +217,6 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
Update();
}
void AnalysisConfig::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
use_mkldnn_bfloat16_ = false;
#endif
Update();
}
MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
"MkldnnQuantizer was not enabled yet.");
......@@ -341,12 +330,6 @@ void AnalysisConfig::Update() {
#endif
}
if (use_mkldnn_bfloat16_) {
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMkldnnBfloat16();
#endif
}
#ifdef PADDLE_WITH_MKLDNN
// Do not optimize when mkldnn is on
if (enable_memory_optim_ && !use_mkldnn_) {
......@@ -415,7 +398,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << ";";
ss << use_mkldnn_quantizer_;
ss << use_mkldnn_bfloat16_;
ss << model_from_memory_;
ss << with_profile_;
......
......@@ -485,25 +485,4 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
}
#endif
#ifdef PADDLE_WITH_CUDA
TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
config.EnableUseGpu(100, 0);
config.EnableMkldnnBfloat16();
#ifdef PADDLE_WITH_MKLDNN
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
#else
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
#endif
}
#endif
TEST(AnalysisPredictor, bf16_pass_strategy) {
std::vector<std::string> passes;
PassStrategy passStrategy(passes);
passStrategy.EnableMkldnnBfloat16();
}
} // namespace paddle
......@@ -401,19 +401,6 @@ struct PD_INFER_DECL AnalysisConfig {
///
void EnableMkldnnQuantizer();
///
/// \brief Turn on MKLDNN bfloat16.
///
///
void EnableMkldnnBfloat16();
///
/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
///
/// \return bool Whether to use the MKLDNN Bfloat16.
///
bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
///
/// \brief A boolean state telling whether the thread local CUDA stream is
/// enabled.
......@@ -605,7 +592,6 @@ struct PD_INFER_DECL AnalysisConfig {
int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
bool use_mkldnn_bfloat16_{false};
// If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor.
......
......@@ -143,10 +143,6 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
LOG(ERROR) << "GPU not support MKL-DNN quantization";
}
void GpuPassStrategy::EnableMkldnnBfloat16() {
LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
......@@ -229,12 +225,4 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
#endif
}
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
use_mkldnn_bfloat16_ = false;
#endif
}
} // namespace paddle
......@@ -132,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable MKLDNN quantize optimization.
virtual void EnableMkldnnQuantizer() {}
/// \brief Enable MKLDNN bfloat16.
virtual void EnableMkldnnBfloat16() {}
/// \brief Check if we are using gpu.
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
......@@ -164,7 +161,6 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
use_gpu_ = other.use_gpu_;
use_mkldnn_ = other.use_mkldnn_;
use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
}
/// \brief Default destructor.
virtual ~CpuPassStrategy() = default;
......@@ -178,13 +174,9 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
/// \brief Enable MKLDNN quantize optimization.
void EnableMkldnnQuantizer() override;
/// \brief Enable MKLDNN bfloat16.
void EnableMkldnnBfloat16() override;
protected:
/// \cond Protected
bool use_mkldnn_quantizer_{false};
bool use_mkldnn_bfloat16_{false};
/// \endcond
};
......@@ -213,9 +205,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Not supported in GPU mode yet.
void EnableMkldnnQuantizer() override;
/// \brief Not supported in GPU mode yet.
void EnableMkldnnBfloat16() override;
/// \brief Default destructor.
virtual ~GpuPassStrategy() = default;
......
......@@ -235,12 +235,6 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
const char* prog_buffer,
size_t prog_buffer_size,
......
......@@ -207,18 +207,6 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
return config->config.mkldnn_quantizer_enabled();
}
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
return config->config.mkldnn_bfloat16_enabled();
}
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) {
......
......@@ -187,6 +187,14 @@ void TensorRTEngine::FreezeNetwork() {
Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
}
infer_builder_config_->addOptimizationProfile(optim_profile_);
infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
if (enable_int8) {
// Due to a TRT bug, we must set the precision BuilderFlag to kFP16 before
// kINT8 here to perform INT8 inference.
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
}
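As a general TensorRT note (not specific to this diff): setting kFP16 alongside kINT8 lets the builder fall back to FP16 kernels for layers that lack INT8 implementations, while kSTRICT_TYPES asks the builder to honor the requested precisions rather than silently choosing faster ones.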
if (WithFp16()) {
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
if (disable_trt_plugin_fp16()) {
......
......@@ -45,12 +45,13 @@ struct SimpleOpTypeSetTeller : public Teller {
private:
// use this set for no calib int8.
std::unordered_set<std::string> int8_teller_set{"matmul",
std::unordered_set<std::string> int8_teller_set{"mul",
"conv2d",
"pool2d",
"relu",
"depthwise_conv2d",
"softmax",
"sigmoid",
"batch_norm",
"elementwise_add",
"leaky_relu",
......
......@@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions(
auto stri_0 = expr_builder.constant(strides_[0]);
auto stri_1 = expr_builder.constant(strides_[1]);
auto one_value = expr_builder.constant(1);
auto tmp1_0 =
expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1);
auto tmp1_1 =
expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1);
auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]);
auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]);
auto tmp2_0 = expr_builder.constant(
(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1);
auto tmp2_1 = expr_builder.constant(
(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1);
auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[2], *stri_0);
auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[3], *stri_1);
auto ceil_tmp =
expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1);
auto ceil1_tmp =
expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1);
if (!ceil_mode_) {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*a_d, *tmp1_0);
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*b_d, *tmp1_1);
output.d[2] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[2], *v0_tmp),
*stri_0),
*one_value);
output.d[3] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[3], *v1_tmp),
*stri_1),
*one_value);
} else {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*a_d, *tmp2_0);
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*b_d, *tmp2_1);
output.d[2] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[2], *ceil_tmp),
*stri_0),
*one_value);
output.d[3] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[3], *ceil1_tmp),
*stri_1),
*one_value);
}
return output;
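Both branches encode the standard pooling output-size formulas; written out in LaTeX (input extent $H$, kernel $k$, padding $p$, stride $s$):

$$H_{out} = \left\lfloor \frac{H + 2p - k}{s} \right\rfloor + 1 \ \text{(floor mode)}, \qquad H_{out} = \left\lfloor \frac{H + 2p - k + s - 1}{s} \right\rfloor + 1 \ \text{(ceil mode)}$$

The rewrite applies kFLOOR_DIV to the whole padded extent, whereas the old code's kCEIL_DIV over the raw input plus a separately floored constant only approximated these formulas.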
......
......@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}
......
......@@ -24,6 +24,39 @@ namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
StackPluginDynamic::StackPluginDynamic(int axis, int num_stack)
: axis_(axis), num_stack_(num_stack) {}
StackPluginDynamic::StackPluginDynamic(void const* serial_data,
size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &num_stack_);
}
StackPluginDynamic::~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const {
return new StackPluginDynamic(axis_, num_stack_);
}
const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; }
int StackPluginDynamic::getNbOutputs() const { return 1; }
int StackPluginDynamic::initialize() { return 0; }
size_t StackPluginDynamic::getSerializationSize() const {
size_t serialize_size = 0;
serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_);
return serialize_size;
}
void StackPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_);
}
nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) {
......@@ -37,6 +70,20 @@ nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
return output;
}
void StackPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
size_t StackPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
return num_stack_ * sizeof(uintptr_t);
}
void StackPluginDynamic::destroy() { delete this; }
void StackPluginDynamic::terminate() {}
bool StackPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
int nb_outputs) {
......@@ -109,8 +156,11 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
lead_unit *= out_dims.d[i];
}
cudaMemcpyAsync(reinterpret_cast<void*>(in_ptr_gpu_),
reinterpret_cast<const void* const>(inputs),
PADDLE_ENFORCE_EQ(
out_dims.d[axis_], num_stack_,
platform::errors::InvalidArgument("number of stack axis should be same"));
cudaMemcpyAsync(workspace, reinterpret_cast<const void* const>(inputs),
sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice,
stream);
......@@ -122,13 +172,13 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
if (infer_type == nvinfer1::DataType::kFLOAT) {
float* output = static_cast<float*>(outputs[0]);
StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const float* const*>(in_ptr_gpu_), output, num_stacks,
reinterpret_cast<const float* const*>(workspace), output, num_stacks,
base_unit);
} else if (infer_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
__half* output = static_cast<__half*>(outputs[0]);
StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const __half* const*>(in_ptr_gpu_), output, num_stacks,
reinterpret_cast<const __half* const*>(workspace), output, num_stacks,
base_unit);
#else
PADDLE_THROW(platform::errors::Fatal(
......@@ -141,6 +191,54 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
}
return cudaGetLastError() != cudaSuccess;
}
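The net effect of this hunk pair: the device-side array of input pointers now lives in the TensorRT-provided workspace (sized num_stack_ * sizeof(uintptr_t) by getWorkspaceSize) instead of the plugin-owned in_ptr_gpu_ buffer. That makes enqueue reentrant, lets clone() be a plain constructor call, and adds a PADDLE_ENFORCE_EQ guard that the stack-axis extent matches num_stack_ before the pointer memcpy.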
StackPluginDynamicCreator::StackPluginDynamicCreator() {}
const char* StackPluginDynamicCreator::getPluginName() const {
return "stack_plugin";
}
const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; }
const nvinfer1::PluginFieldCollection*
StackPluginDynamicCreator::getFieldNames() {
return &field_collection_;
}
nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) {
int axis = -1;
int num_stack = -1;
for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name);
if (name == "axis") {
axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else {
PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" +
name +
"' when creating stack op plugin."));
}
}
return new StackPluginDynamic(axis, num_stack);
}
nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin(
const char* name, const void* serial_data, size_t serial_length) {
auto plugin = new StackPluginDynamic(serial_data, serial_length);
return plugin;
}
void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) {
plugin_namespace_ = lib_namespace;
}
const char* StackPluginDynamicCreator::getPluginNamespace() const {
return plugin_namespace_.c_str();
}
#endif
} // namespace plugin
......
......@@ -28,68 +28,24 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class StackPluginDynamic : public DynamicPluginTensorRT {
public:
StackPluginDynamic(int axis, int num_stack)
: axis_(axis), num_stack_(num_stack) {
init();
}
StackPluginDynamic(void const* serialData, size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &axis_);
DeserializeValue(&serialData, &serialLength, &num_stack_);
init();
}
~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new StackPluginDynamic(axis_, num_stack_);
}
void init() {
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor_.Resize({num_stack_});
in_ptr_gpu_ =
in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
}
const char* getPluginType() const override { return "stack_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override {
size_t serialize_size = 0;
serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_);
return serialize_size;
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_);
}
explicit StackPluginDynamic(int axis, int num_stack);
StackPluginDynamic(void const* serial_data, size_t serial_length);
~StackPluginDynamic();
nvinfer1::IPluginV2DynamicExt* clone() const override;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
int nbOutputs) override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int nbOutputs) const override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
......@@ -99,68 +55,39 @@ class StackPluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
const char* getPluginType() const override;
int getNbOutputs() const override;
int initialize() override;
void terminate() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
void destroy() override;
private:
int axis_;
int num_stack_;
framework::Tensor in_ptr_tensor_;
int64_t* in_ptr_gpu_;
};
class StackPluginV2Creator : public nvinfer1::IPluginCreator {
class StackPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
StackPluginV2Creator() {}
const char* getPluginName() const override { return "stack_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
StackPluginDynamicCreator();
const char* getPluginName() const override;
const char* getPluginVersion() const override;
const nvinfer1::PluginFieldCollection* getFieldNames() override;
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
int axis = -1;
int num_stack = -1;
for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name);
if (name == "axis") {
axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else {
PADDLE_THROW(
platform::errors::Fatal("Meet an unknown plugin field '" + name +
"' when creating stack op plugin."));
}
}
return new StackPluginDynamic(axis, num_stack);
}
const char* name, const nvinfer1::PluginFieldCollection* fc) override;
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new StackPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
size_t serial_length) override;
void setPluginNamespace(const char* lib_namespace) override;
const char* getPluginNamespace() const override;
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(StackPluginV2Creator);
REGISTER_TRT_PLUGIN_V2(StackPluginDynamicCreator);
#endif
} // namespace plugin
......
......@@ -431,9 +431,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model")
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz")
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
endif()
inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
......
......@@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) {
PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(!bfloat16_enable) << "NO";
PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
false);
bool trt_enable = PD_TensorrtEngineEnabled(config);
......
......@@ -88,9 +88,6 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
PD_EnableMkldnnQuantizer(config);
bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
CHECK(quantizer_enable) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(bfloat16_enable) << "NO";
PD_SetMkldnnCacheCapacity(config, 0);
PD_SetModel(config, prog_file.c_str(), params_file.c_str());
PD_DeleteAnalysisConfig(config);
......
......@@ -25,12 +25,20 @@ namespace inference {
TEST(quant_int8, resnet50) {
std::string model_dir = FLAGS_infer_model;
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.EnableUseGpu(1000, 0);
config.SetModel(model_dir);
config.SwitchUseFeedFetchOps(false);
config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8,
false, false);
std::map<std::string, std::vector<int>> min_input_shape = {
{"image", {1, 1, 3, 3}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"image", {1, 1, 10, 10}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"image", {1, 1, 3, 3}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();
int channels = 1;
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
......@@ -243,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Operator. Computes cosine of x element-wise.
Input range is `(-inf, inf)` and output range is `[-1,1]`.
Return `nan` if input is out of boundary.
$$out = cos(x)$$
......@@ -341,7 +341,9 @@ $$out = \cos^{-1}(x)$$
class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of asin operator");
AddInput("X",
"Input of asin operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of asin operator");
AddComment(R"DOC(
Arcsine Operator.
......@@ -355,7 +357,9 @@ $$out = \sin^{-1}(x)$$
class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of atan operator");
AddInput("X",
"Input of atan operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of atan operator");
AddComment(R"DOC(
Arctangent Operator.
......@@ -1231,3 +1235,34 @@ REGISTER_OP_CPU_KERNEL(
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::AbsGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
R"ROC(fix leaky_relu, bahavior changed when alpha < 0 or alpha > 1)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"leaky_relu calculate formula before checkponit: out = max(x, "
"alpha * x); after checkpoint: out = x if x > 0 else alpha * "
"x"));
REGISTER_OP_VERSION(hard_shrink)
.AddCheckpoint(
R"ROC(fix hard_shrink, bahavior changed when threshold<0)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"hard_shrink calculate formula before checkponit: out = x * "
"((x < -threshold) + (x > threshold)); after checkpoint: out = "
"x * (((x < -threshold) + (x > threshold)) > 0)"));
REGISTER_OP_VERSION(softplus)
.AddCheckpoint(
R"ROC(add new attributes [beta] and [threshold], and the formula is changed to "
" softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical"
" stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("beta", "The beta value of the new formula", 1.0f)
.NewAttr("threshold", "The threshold value of the new formula",
20.0f));
/* ========================================================================== */
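Rendered from the ROC string above, the new softplus formula is

$$\mathrm{softplus}(x) = \frac{1}{\beta}\,\log\left(1 + e^{\beta x}\right)$$

with the implementation reverting to the linear function $x$ when $\beta x > \text{threshold}$ for numerical stability (registered defaults: beta = 1.0, threshold = 20.0).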
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR(
......@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
REGISTER_OP_VERSION(arg_max)
.AddCheckpoint(
R"ROC(
Upgrade argmax: add a new attribute [flatten] and modify the attribute of dtype)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("flatten",
"In order to compute the argmax over the flattened array "
"when the "
"argument `axis` in python API is None.",
false)
.ModifyAttr(
"dtype",
"change the default value of dtype, the older version "
"is -1, means return the int64 indices."
"The new version is 3, return the int64 indices directly."
"And supporting the dtype of -1 in new version.",
3));
......@@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor {
auto axis = ctx.Attr<int64_t>("axis");
auto keepdims = ctx.Attr<bool>("keepdims");
const bool& flatten = ctx.Attr<bool>("flatten");
// paddle does not have a scalar tensor; just return a tensor of shape [1]
if (flatten) keepdims = true;
// if flatten, construct the new dims for the calculation
framework::DDim x_dims;
......@@ -164,15 +166,30 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
platform::errors::InvalidArgument(
"'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank;
if (ctx->IsRuntime()) {
const int& dtype = ctx->Attrs().Get<int>("dtype");
if (dtype == framework::proto::VarType::INT32) {
int64_t all_element_num = 0;
if (flatten) {
all_element_num = framework::product(x_dims);
} else {
all_element_num = x_dims[axis];
}
PADDLE_ENFORCE_LE(
all_element_num, INT_MAX,
"The element num of the argmin/argmax input at axis is "
"%d, is larger than int32 maximum value:%d, you must "
"set the dtype of argmin/argmax to 'int64'.",
all_element_num, INT_MAX);
}
}
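A worked example of why this check matters (shapes hypothetical): with flatten = true, an input of shape [70000, 70000] has 4.9e9 elements, which exceeds INT_MAX (2147483647), so int32 indices would overflow and dtype must be set to 'int64'.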
std::vector<int64_t> vec;
if (flatten) {
// if flatten, will return only one element
if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1));
}
vec.emplace_back(static_cast<int64_t>(1));
} else {
auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank;
for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1));
......@@ -194,10 +211,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "Output tensor.");
AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
AddAttr<bool>("flatten",
"Flatten the input value, and search the min or max indices")
.SetDefault(false);
AddAttr<int>("dtype",
"(int, 3), the dtype of indices, the indices dtype must be "
"int32, int64."
"default dtype is int64, and proto value is 3.")
.SetDefault(3);
AddComment(string::Sprintf(R"DOC(
%s Operator.
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR(
......@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
uint8_t>);
REGISTER_OP_VERSION(arg_min)
.AddCheckpoint(
R"ROC(
Upgrade argmin: add a new attribute [flatten] and modify the attribute of dtype)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("flatten",
"In order to compute the argmin over the flattened array "
"when the "
"argument `axis` in python API is None.",
false)
.ModifyAttr(
"dtype",
"change the default value of dtype, the older version "
"is -1, means return the int64 indices."
"The new version is 3, return the int64 indices directly."
"And supporting the dtype of -1 in new version.",
3));
......@@ -31,6 +31,10 @@ struct BernoulliCudaFunctor {
__host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
__host__ __device__ T operator()(const unsigned int n, const T p) const {
// NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
// lines of error messages if it fails, and it should be refined.
PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
"The probability should be >=0 and <= 1, but got %f", p);
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(0.0, 1.0);
......
......@@ -25,10 +25,12 @@ namespace operators {
template <typename T>
inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
"The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
"The probability should be >= 1, but got %f", p));
PADDLE_ENFORCE_LE(p, 1.0,
platform::errors::OutOfRange(
"The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p, 0.0,
platform::errors::OutOfRange(
"The probability should be >= 0, but got %f", p));
return static_cast<T>(rand < p);
}
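In LaTeX terms the functor computes $out = \mathbf{1}[rand < p]$ with $rand \sim U(0, 1)$, so $P(out = 1) = p$, i.e. an exact Bernoulli(p) draw; the corrected messages now describe both ends of the valid range $[0, 1]$ accurately.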
......
......@@ -56,7 +56,7 @@ endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op)
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
......
......@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h,
&trainer_id);
}
void ProcGetRecvResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
VLOG(4) << "ProcGetRecvResponse";
framework::Variable* outvar = nullptr;
int trainer_id;
DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
&trainer_id);
}
template <typename T>
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong());
......@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
return h;
}
VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string send_var_name_val = send_var_name;
const std::string recv_var_name_val = recv_var_name;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
const std::string method = kSendAndRecvRPC;
VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: "
<< send_var_name_val << " Recv_var_name: " << recv_var_name_val;
int retry_times_ = 0;
while (true) {
SendAndRecvProcessor* s = new SendAndRecvProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope));
VarHandlePtr h_recv(
new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
s->RecvPrepare(h_recv);
framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val,
p_scope, p_ctx, s, method, h, this] {
auto* send_var = p_scope->FindVar(send_var_name_val);
send_var->GetMutable<framework::LoDTensor>()->set_lod({});
::grpc::ByteBuffer buf;
VLOG(4) << "SerializeToByteBuffer: send_var_name_val: "
<< send_var_name_val
<< " recv_var_name_val: " << recv_var_name_val;
SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf,
recv_var_name_val, trainer_id_, table_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetRecvResponse;
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
h->Wait();
if (h->should_retry) {
VLOG(3) << "rpc call failed, retry times " << retry_times_;
retry_times_++;
std::random_device rd;
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
continue;
}
}
return h;
}
}
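A hedged usage sketch against the declared signature (endpoint and variable names are hypothetical, not from this diff):

// client is a GRPCClient*; ctx and scope come from the caller
VarHandlePtr h = client->AsyncSendAndRecv("127.0.0.1:6174", ctx, scope, "x@send", "x@recv");
h->Wait();  // returns once the RPC completes and the response is deserialized into the recv-side variable

table_name defaults to "" and time_out to FLAGS_rpc_deadline, matching the header declaration shown further down.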
bool GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
......
......@@ -53,6 +53,8 @@ namespace distributed {
void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor {
public:
BaseProcessor() { context_ = nullptr; }
......@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
RequestGetCallBack response_call_back_ = ProcGetResponse;
};
class SendAndRecvProcessor : public BaseProcessor {
public:
explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(), stub_g_(ch) {}
virtual ~SendAndRecvProcessor() {}
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(*var_h_recv_.get(), reply_);
var_h_recv_->Finish(true);
}
}
void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
::grpc::ByteBuffer reply_;
::grpc::GenericStub stub_g_;
RequestGetCallBack response_call_back_ = ProcGetResponse;
VarHandlePtr var_h_recv_;
};
class BatchBarrierProcessor : public BaseProcessor {
public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
......@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
......
......@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
}
std::string header;
request.AppendToString(&header);
auto buffer = std::unique_ptr<char[]>(new char[1024]);
......@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
}
#endif
PADDLE_ENFORCE_NOT_NULL(payload);
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) {
......@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
......@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
*trainer_id = resp.GetTrainerId();
}
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE_EQ(
resp.Parse(msg), 0,
platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
*var = resp.GetRecvVar();
*trainer_id = resp.GetTrainerId();
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
namespace paddle {
namespace operators {
namespace distributed {
enum CallStatus { PROCESS = 0, FINISH };
// reference:
......@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
class RequestSendAndRecv final : public RequestBase {
public:
explicit RequestSendAndRecv(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new GRPCVariableResponse(
request_handler->scope(), request_handler->dev_ctx(),
request_handler->distributed_mode()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestSendAndRecv() {}
std::string GetReqName() override { return request_->Varname(); }
void Process() override {
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
std::string table_name = request_->TableName();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name << " trainer: " << trainer_id;
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = nullptr;
request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
out_var_name, table_name);
SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
&reply_);
Finish(reply_, &responder_);
}
protected:
std::shared_ptr<GRPCVariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
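// Illustration (not part of this change): Process() above follows the
// standard async unary-RPC shape -- parse the request, run the registered
// handler, serialize the result, then Finish(). A framework-free sketch of
// that control flow; every type and name below is a stand-in, not Paddle's:
#include <functional>
#include <string>

struct FakeRequest {
  std::string in_var, out_var, table;
  int trainer_id;
};

std::string ProcessSendAndRecv(
    const FakeRequest& req,
    const std::function<std::string(const FakeRequest&)>& handle) {
  // 1. The handler runs the prepared sub-graph keyed by in_var and leaves
  //    the result under out_var (cf. RequestSendAndRecvHandler::Handle).
  std::string out_payload = handle(req);
  // 2. The serialized payload is handed to the async responder.
  return out_payload;
}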
void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is waiting server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
......@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestNotify) {
b = new RequestNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestSendAndRecv) {
b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id);
} else {
PADDLE_ENFORCE(false, "unsupported RPC name");
}
......
......@@ -85,10 +85,12 @@ enum class GrpcMethod {
kGetMonomerVariable,
kGetMonomerBarrier,
kRequestNotify,
kRequestSendAndRecv,
// when you add new handler, change kGrpcNumMethods at the same time!
};
static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kRequestNotify) + 1;
static_cast<int>(GrpcMethod::kRequestSendAndRecv) + 1;
inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) {
......@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/CheckpointNotify";
case GrpcMethod::kRequestNotify:
return "/sendrecv.SendRecvService/DistributeNotify";
case GrpcMethod::kRequestSendAndRecv:
return "/sendrecv.SendRecvService/SendAndRecvVariable";
}
// Shouldn't be reached.
......
......@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
constexpr char kRequestNotify[] = "RequestNotify";
constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
constexpr char kSendRPC[] = "SendRPC";
constexpr char kGetRPC[] = "GetRPC";
......@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
constexpr int64_t kPrefetchTimeout = 60000;
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
......
......@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
return true;
}
bool RequestSendAndRecvHandler::Handle(const std::string &varname,
framework::Scope *Scope,
framework::Variable *var,
framework::Variable **outvar,
const int trainer_id,
const std::string &out_var_name,
const std::string &table_name) {
VLOG(3) << "SendAndRecvHandle: " << varname
<< " out_var_name: " << out_var_name
<< " , trainer_id: " << trainer_id;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope);
*outvar = Scope->FindVar(out_var_name);
return true;
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
std::unordered_map<int, int64_t> decay_counters;
};
class RequestSendAndRecvHandler final : public RequestHandler {
public:
explicit RequestSendAndRecvHandler(int distributed_mode)
: RequestHandler(distributed_mode) {}
virtual ~RequestSendAndRecvHandler() {}
bool Handle(const std::string& varname, framework::Scope* Scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -85,6 +85,12 @@ class RPCClient {
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendAndRecv(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& send_var_name,
const std::string& recv_var_name, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
......
......@@ -35,27 +35,24 @@ namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;
USE_NO_KERNEL_OP(lookup_sparse_table_read);
USE_OP(scale);
std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_sparse_table_read");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
op->SetAttr("tablename", {"w"});
op->SetAttr("value_names", {"Param"});
auto& out = *root_block->Var("out");
framework::OpDesc* op = block->AppendOp();
op->SetType("scale");
op->SetInput("X", {"x"});
op->SetOutput("Out", {"res"});
op->SetAttr("scale", 0.5f);
auto& out = *root_block->Var("res");
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({10, 10});
out.SetShape({1, 10});
return block;
}
......@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::LoDTensor>();
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
......@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
......@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) {
server_thread.join();
}
void StartSendAndRecvServer(const std::string& rpc_name) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto block = AppendSendAndRecvBlock(&program);
std::string in_var_name("x");
std::vector<int> prefetch_block_ids{block->ID()};
auto prepared = exe.Prepare(program, prefetch_block_ids);
InitTensorsOnServer(&scope, &place, 10);
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx;
grad_to_prepared_ctx[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program);
g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(COMPLETE, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
......@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
TEST(SENDANDRECV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
distributed::DistributedMode::kAsync));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
PADDLE_ENFORCE_NE(client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
std::thread server_thread(StartSendAndRecvServer,
distributed::kRequestSendAndRecv);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
client->Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[i], 0.5);
}
g_rpc_service->ShutDown();
server_thread.join();
LOG(INFO) << "begin reset";
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
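// Illustration (not part of this change): the EXPECT_EQ above follows from
// the block built in AppendSendAndRecvBlock -- the server computes
// res = 0.5 * x on a tensor of ones. A standalone check of that arithmetic:
#include <vector>

int main() {
  const float scale = 0.5f;        // matches op->SetAttr("scale", 0.5f)
  std::vector<float> x(10, 1.0f);  // InitTensorsOnClient fills x with ones
  std::vector<float> res(x.size());
  for (size_t i = 0; i < x.size(); ++i) res[i] = scale * x[i];
  return res[0] == 0.5f ? 0 : 1;   // every element equals 0.5
}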
......@@ -29,7 +29,7 @@ service SendRecvService {
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
}
......
......@@ -96,6 +96,13 @@ class VariableResponse {
return scope_->FindVar(meta_.varname());
}
framework::Variable* GetRecvVar() {
if (create_scope_) {
return local_scope_->Var(meta_.out_varname());
}
return scope_->FindVar(meta_.out_varname());
}
int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
protected:
......
......@@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("Ids"),
"Input(Ids) of LookupTableOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input(W) of LookupTableOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
"Output(Outs) of LookupTableOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
platform::errors::InvalidArgument(
"Input(Ids) of LookupTableOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
platform::errors::InvalidArgument(
"Input(W) of LookupTableOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
platform::errors::InvalidArgument(
"Output(Outs) of LookupTableOp should not be null."));
auto ids_dims = ctx->GetInputsDim("Ids");
auto table_dims = ctx->GetInputDim("W");
PADDLE_ENFORCE_EQ(table_dims.size(), 2,
"Only 2 dimensions of the 'Embedding' is supported.");
PADDLE_ENFORCE_EQ(
table_dims.size(), 2,
platform::errors::InvalidArgument(
"Only 2 dimensions of the 'Embedding' is supported."));
for (auto &ids_dim : ids_dims) {
PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
"The dimension of the 'Ids' tensor must be 2.");
platform::errors::InvalidArgument(
"The dimension of the 'Ids' tensor must be 2."));
}
auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
// for fluid.embedding
auto lookup_table_version =
ctx->Attrs().Get<std::string>("lookup_table_version");
......
......@@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
auto is_distributed = context.Attr<bool>("is_distributed");
auto lookup_table_version =
context.Attr<std::string>("lookup_table_version");
operators::distributed::prefetchs(id_names, out_names, embedding_name,
is_distributed, lookup_tables, endpoints,
context, context.scope());
if (lookup_table_version == "lookup_table_v2") {
auto &scope = context.scope();
auto emb_dim =
scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
for (size_t i = 0; i < id_names.size(); ++i) {
auto *id_var = scope.FindVar(id_names[i]);
auto *out_var = scope.FindVar(out_names[i]);
auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto id_dims = id_tensor->dims();
out_tensor->Resize(framework::make_ddim(
{static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
static_cast<int64_t>(emb_dim)}));
}
}
}
};
......
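// Illustration (not part of this change): the lookup_table_v2 branch above
// restores the rank-3 output that fluid.embedding expects. A standalone
// sketch of the shape arithmetic, with hypothetical sizes:
#include <cassert>
#include <cstdint>

int main() {
  const int64_t id_rows = 8, id_cols = 16;  // ids: [8, 16]
  const int64_t emb_dim = 128;              // W:   [vocab, 128]
  // The kernel resizes the flat [id_rows * id_cols, emb_dim] lookup result
  // back to [id_rows, id_cols, emb_dim] so downstream ops see rank 3.
  const int64_t out[3] = {id_rows, id_cols, emb_dim};
  assert(out[0] * out[1] * out[2] == id_rows * id_cols * emb_dim);
  return 0;
}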
......@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid);
......@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
while (true) {
if (rpc_service_->IsExit()) {
......@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
new distributed::RequestGetNoBarrierHandler());
request_notify_handler_.reset(
new distributed::RequestNotifyHandler(distributed_mode, fan_in));
request_send_and_recv_handler_.reset(
new distributed::RequestSendAndRecvHandler(distributed_mode));
rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get(), rpc_send_thread_num);
......@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_no_barrier_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestNotify,
request_notify_handler_.get(), rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
request_send_and_recv_handler_.get(),
rpc_get_thread_num);
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
......@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
"optimize blocks is less than 1. Optimize blocks "
"should be 1 at least on the pserver side."));
auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place);
std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
......@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
f(request_checkpoint_handler_.get());
f(request_get_no_barrier_handler_.get());
f(request_notify_handler_.get());
f(request_send_and_recv_handler_.get());
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
signal(SIGINT, SignalHandler::StopAndExit);
......
......@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
mutable std::shared_ptr<distributed::RequestHandler>
request_checkpoint_handler_;
mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_send_and_recv_handler_;
mutable std::shared_ptr<std::thread> server_thread_;
mutable std::vector<std::string> sparse_vars_;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_send.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
const auto& place = ctx.GetPlace();
auto send_var_name = ctx.Attr<std::string>("send_var_name");
auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
auto endpoint = ctx.Attr<std::string>("endpoint");
auto trainer_id = ctx.Attr<int>("trainer_id");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& context = *pool.Get(place);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
<< " Recv_var_name: " << recv_var_name;
distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
endpoint, context, scope, send_var_name, recv_var_name);
rets->Wait();
}
};
class SendAndRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
AddAttr<std::string>("send_var_name", "Send Tensor's name")
.SetDefault(std::string(""));
AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
.SetDefault(std::string(""));
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::string>("endpoint", "Server endpoint")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
SendAndRecv operator
This operator sends a variable to the listen_and_serve op on the parameter server,
then receives the result variable from the parameter server into the sender's scope.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
REGISTER_OP_CPU_KERNEL(
send_and_recv,
ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
......@@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
auto out_vars = context.MultiOutputVar("Out");
for (size_t i = 0; i < out_var_names.size(); i++) {
VLOG(4) << "loading tensor: " << out_var_names[i];
PADDLE_ENFORCE_NOT_NULL(
out_vars[i], platform::errors::InvalidArgument(
"The variable %s to be loaded cannot be found.",
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_v2_op.h"
#include <memory>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
......@@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradKernel<float>,
ops::LookupTableV2GradKernel<double>);
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(lookup_table_v2)
.AddCheckpoint(
R"ROC(fix lookup_table_v2, add input type `int32`)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged("In the previous version, lookup_table_v2 "
"supported only input type `int64`; after this "
"fix it supports `int32/int64`"));
/* ========================================================================== */
......@@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
}
}
template <typename T>
__global__ void InputTypeCovert(const T *in_ids, const int64_t K,
int64_t *out_ids) {
for (int i = 0; i < K; i++) {
out_ids[i] = (int64_t)(in_ids[i]);
}
}
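// Illustration (not part of this change): a CPU analogue of the widening
// conversion InputTypeConvert performs on device; the std::transform idiom
// mirrors what the CPU kernel in this change uses for the same promotion.
#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int32_t> in_ids = {3, 1, 4, 1, 5};
  std::vector<int64_t> out_ids(in_ids.size());
  // Element-wise widening cast, one output index per input index.
  std::transform(in_ids.begin(), in_ids.end(), out_ids.begin(),
                 [](int32_t id) { return static_cast<int64_t>(id); });
  return out_ids[0] == 3 ? 0 : 1;
}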
template <typename T>
class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
public:
......@@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
size_t D = table_t->dims()[1];
size_t K = ids_t->numel();
auto *ids = ids_t->data<int64_t>();
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
dim3 threads(256, 4);
dim3 grids(80, 1);
// staging buffer exposing the ids as int64 for the lookup kernel below
framework::Vector<int64_t> ids;
ids.resize(K);
const int64_t *ids_p = nullptr;
if (ids_t->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
ids_p = ids.MutableData(context.GetPlace());
} else {
ids_p = ids_t->data<int64_t>();
}
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
if (padding_idx == -1)
LookupTableV2<
T, 256, 4, 80,
false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
output, table, ids_p, N, K, D, padding_idx);
else
LookupTableV2<
T, 256, 4, 80,
true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
output, table, ids_p, N, K, D, padding_idx);
}
};
......@@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
dim3 threads(128, 8);
dim3 grids(8, 1);
auto stream = dev_ctx.stream();
// rows buffer for the sparse gradient, filled from the ids below
framework::Vector<int64_t> new_rows;
new_rows.resize(ids_num);
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
if (ids->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids->data<int>(), ids_num,
new_rows.MutableData(context.GetPlace()));
} else {
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
}
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......@@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
int K = ids_t->numel();
const int64_t *ids = ids_t->data<int64_t>();
dim3 threads(128, 8);
dim3 grids(8, 1);
// staging buffer exposing the ids as int64 for the gradient kernel below
framework::Vector<int64_t> ids;
ids.resize(K);
const int64_t *ids_p = nullptr;
if (ids_t->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
ids_p = ids.MutableData(context.GetPlace());
} else {
ids_p = ids_t->data<int64_t>();
}
const T *d_output = d_output_t->data<T>();
T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
dim3 threads(128, 8);
dim3 grids(8, 1);
LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids, N, K, D);
d_table, d_output, ids_p, N, K, D);
}
}
};
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
......@@ -45,84 +46,70 @@ class LookupTableV2Kernel : public framework::OpKernel<T> {
auto *output_t = context.Output<LoDTensor>("Out"); // float tensor
auto *table_var = context.InputVar("W");
auto id_name = context.InputNames("Ids").front();
auto embedding_name = context.InputNames("W").front();
auto out_name = context.OutputNames("Out").front();
// for remote prefetch
auto epmap = context.Attr<std::vector<std::string>>("epmap");
auto remote_prefetch = context.Attr<bool>("remote_prefetch");
auto table_names = context.Attr<std::vector<std::string>>("table_names");
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int64_t ids_numel = ids_t->numel();
if (remote_prefetch && !epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter server
std::vector<int64_t> ids;
ids.reserve(ids_numel);
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, embedding_name, false,
table_names, epmap, context,
context.scope());
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
if (ids_t->type() == framework::proto::VarType::INT32) {
std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_numel,
std::back_inserter(ids),
[&](int id) { return static_cast<int64_t>(id); });
} else {
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
int64_t ids_numel = ids_t->numel();
if (table_var->IsType<LoDTensor>()) {
auto *table_t = context.Input<LoDTensor>("W");
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_LT(
ids[i], row_number,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number, ids[i]);
PADDLE_ENFORCE_GE(
ids[i], 0,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number, ids[i]);
memcpy(output + i * row_width, table + ids[i] * row_width,
row_width * sizeof(T));
}
framework::TensorToVector(*ids_t, &ids);
}
if (table_var->IsType<LoDTensor>()) {
auto *table_t = context.Input<LoDTensor>("W");
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_LT(
ids[i], row_number,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number, ids[i]);
PADDLE_ENFORCE_GE(
ids[i], 0,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number, ids[i]);
memcpy(output + i * row_width, table + ids[i] * row_width,
row_width * sizeof(T));
}
} else if (table_var->IsType<SelectedRows>()) {
const auto &table_t = table_var->Get<SelectedRows>();
int64_t row_width = table_t.value().dims()[1];
const auto *table = table_t.value().data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_GE(
ids[i], 0,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0. But received %ld",
ids[i]);
auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE(
id_index, 0, "the input key should be exists. But received %d.",
id_index);
blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width);
}
}
} else if (table_var->IsType<SelectedRows>()) {
const auto &table_t = table_var->Get<SelectedRows>();
int64_t row_width = table_t.value().dims()[1];
const auto *table = table_t.value().data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_GE(
ids[i], 0,
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0. But received %ld",
ids[i]);
auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE(id_index, 0,
"the input key should be exists. But received %d.",
id_index);
blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width);
}
}
}
......@@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
auto *ids = context.Input<LoDTensor>("Ids");
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
int64_t ids_num = ids_t->numel();
std::vector<int64_t> ids;
ids.reserve(ids_num);
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
if (ids_t->type() == framework::proto::VarType::INT32) {
std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
std::back_inserter(ids),
[&](int id) { return static_cast<int64_t>(id); });
} else {
framework::TensorToVector(*ids_t, &ids);
}
std::vector<int64_t> new_rows;
new_rows.resize(ids_num);
std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
d_table->set_rows(new_rows);
d_table->set_rows(ids);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
......@@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
auto *ids = context.Input<LoDTensor>("Ids");
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
int64_t ids_num = ids_t->numel();
std::vector<int64_t> ids;
ids.reserve(ids_num);
if (ids_t->type() == framework::proto::VarType::INT32) {
std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
std::back_inserter(ids),
[&](int id) { return static_cast<int64_t>(id); });
} else {
framework::TensorToVector(*ids_t, &ids);
}
auto *ids_data = ids->data<int64_t>();
auto *ids_data = ids.data();
int64_t N = table_dim[0];
int64_t D = table_dim[1];
......@@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
memset(d_table_data, 0, d_table->numel() * sizeof(T));
for (int64_t i = 0; i < ids->numel(); ++i) {
for (int64_t i = 0; i < ids_num; ++i) {
if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
// the gradient of padding_idx should be 0, already done by memset, so
// do nothing.
......
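// Illustration (not part of this change): the dense-gradient loop above is
// a scatter-accumulate with padding rows left at zero. A standalone sketch:
#include <cstddef>
#include <vector>

int main() {
  const int D = 4, N = 6, padding_idx = 0;
  const std::vector<long long> ids = {2, 0, 2, 5};
  const std::vector<float> d_output(ids.size() * D, 1.0f);
  std::vector<float> d_table(N * D, 0.0f);  // memset-equivalent zero init
  for (std::size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == padding_idx) continue;  // padding gradient stays zero
    for (int j = 0; j < D; ++j)
      d_table[ids[i] * D + j] += d_output[i * D + j];
  }
  // Row 2 was looked up twice, so its gradient accumulates to 2.0.
  return d_table[2 * D] == 2.0f ? 0 : 1;
}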
......@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using Tensor = framework::Tensor;
static inline int SizeOutAxis(const int axis, DDim dims) {
int size = 1;
for (int i = axis + 1; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
template <typename T>
class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out");
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
math::SoftmaxCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(),
&flattened_x, &flattened_out);
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto* out_data = out->data<T>();
auto dims = x->dims();
const int rank = dims.size();
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
const int dim = dims[axis];
const int N = SizeToAxis(axis, dims);
const int D = SizeOutAxis(axis, dims);
ScopedTensorDescriptor desc;
std::vector<int> tensor_dims = {N, dim, D, 1};
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
platform::CudnnDataType<T>::kZero(), desc_, out_data));
}
};
template <typename T>
class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* Out = context.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
math::SoftmaxGradCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(),
&flattened_out, &flattened_d_out, &flattened_d_x);
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* dx_data = dx->data<T>();
auto dims = out->dims();
const int rank = dims.size();
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
const int dim = dims[axis];
const int N = SizeToAxis(axis, dims);
const int D = SizeOutAxis(axis, dims);
ScopedTensorDescriptor desc;
std::vector<int> tensor_dims = {N, dim, D, 1};
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_, dx_data));
}
};
......
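// Illustration (not part of this change): the cuDNN kernels above flatten an
// arbitrary-rank tensor into a {N, dim, D, 1} NCHW descriptor around the
// softmax axis. A standalone sketch of that index arithmetic; the helpers
// mirror SizeToAxis/SizeOutAxis and are re-implemented here for illustration:
#include <cassert>
#include <vector>

static int ProductToAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];  // outer size N
  return size;
}

static int ProductAfterAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i)
    size *= dims[i];  // inner size D
  return size;
}

int main() {
  const std::vector<int> dims = {2, 3, 4, 5};  // hypothetical input shape
  const int axis = 1;                          // softmax over the 3-sized dim
  const int N = ProductToAxis(axis, dims);     // 2
  const int dim = dims[axis];                  // 3
  const int D = ProductAfterAxis(axis, dims);  // 4 * 5 = 20
  // Since axis != rank - 1 here, D > 1 and CUDNN_SOFTMAX_MODE_CHANNEL
  // normalizes along `dim`; for the last axis D == 1 and the instance
  // mode covers the same elements.
  assert(N * dim * D == 2 * 3 * 4 * 5);
  return 0;
}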
......@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
"Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(X)."));
auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
if (axis != rank_x - 1 && axis != -1) {
PADDLE_ENFORCE_EQ(use_cudnn, false,
platform::errors::InvalidArgument(
"CUDNN kernel only support axis as -1."));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
......
......@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/unsqueeze_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
......@@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
REGISTER_OP_CPU_KERNEL(
unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -57,7 +57,11 @@ void BindFleetWrapper(py::module* m) {
.def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold)
.def("cache_shuffle", &framework::FleetWrapper::CacheShuffle)
.def("save_cache", &framework::FleetWrapper::SaveCache)
.def("save_model_with_whitelist",
&framework::FleetWrapper::SaveWithWhitelist)
.def("load_model", &framework::FleetWrapper::LoadModel)
.def("load_table_with_whitelist",
&framework::FleetWrapper::LoadWithWhitelist)
.def("clear_model", &framework::FleetWrapper::ClearModel)
.def("clear_one_table", &framework::FleetWrapper::ClearOneTable)
.def("stop_server", &framework::FleetWrapper::StopServer)
......
......@@ -334,8 +334,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
} while (0)
static void RegisterGlobalVarGetterSetter() {
REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_use_mkldnn,
FLAGS_free_idle_chunk,
REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
FLAGS_free_when_no_cache_hit);
REGISTER_PUBLIC_GLOBAL_VAR(
......@@ -349,7 +348,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads);
FLAGS_paddle_num_threads, FLAGS_use_mkldnn);
#ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR(
......
......@@ -448,7 +448,6 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::cpu_math_library_num_threads)
.def("to_native_config", &AnalysisConfig::ToNativeConfig)
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
#ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
py::return_value_policy::reference)
......@@ -566,7 +565,6 @@ void BindPaddlePassBuilder(py::module *m) {
.def("enable_cudnn", &PassStrategy::EnableCUDNN)
.def("enable_mkldnn", &PassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16)
.def("use_gpu", &PassStrategy::use_gpu);
py::class_<CpuPassStrategy, PassStrategy>(*m, "CpuPassStrategy")
......@@ -574,16 +572,14 @@ void BindPaddlePassBuilder(py::module *m) {
.def(py::init<const CpuPassStrategy &>())
.def("enable_cudnn", &CpuPassStrategy::EnableCUDNN)
.def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16);
.def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer);
py::class_<GpuPassStrategy, PassStrategy>(*m, "GpuPassStrategy")
.def(py::init<>())
.def(py::init<const GpuPassStrategy &>())
.def("enable_cudnn", &GpuPassStrategy::EnableCUDNN)
.def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16);
.def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer);
}
} // namespace
} // namespace pybind
......
......@@ -42,7 +42,6 @@ requirements:
- nltk
- scipy
- requests
- pyyaml
- pillow
- graphviz
- protobuf
......@@ -62,7 +61,6 @@ requirements:
- nltk
- scipy
- requests
- pyyaml
- pillow
- graphviz
- protobuf
......@@ -89,13 +87,11 @@ about:
pip install /package/objgraph-3.4.1.tar.gz
pip install /package/prettytable-0.7.tar.gz
pip install /package/rarfile-3.0.tar.gz --no-deps
pip install /package/funcsigs-1.0.2.tar.gz
"""
self.blt_const = r"""
pip install C:\package\objgraph-3.4.1.tar.gz
pip install C:\package\prettytable-0.7.tar.gz
pip install C:\package\funcsigs-1.0.2.tar.gz
pip install C:\package\rarfile-3.0.tar.gz --no-deps
git clone https://github.com/PaddlePaddle/recordio.git
cd recordio\python
......
......@@ -19,10 +19,14 @@ rem =================================================
rem Paddle CI Task On Windows Platform
rem =================================================
rem -------clean up environment-----------
set work_dir=%cd%
if exist build rmdir build /s/q
mkdir build
cd /d build
tree .
dir paddle\fluid\pybind\Release
taskkill /f /im op_function_generator.exe 2>NUL
rem ------initialize the virtual environment------
if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
......@@ -59,13 +63,12 @@ if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF
if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem ------set cache third_party------
set cache_dir=%work_dir%\..\cache
set cache_dir=%work_dir:Paddle=cache%
dir %cache_dir%
set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
if not exist %cache_dir%\tools (
git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL%
)
if "%WITH_TPCACHE%"=="OFF" (
......@@ -125,6 +128,8 @@ echo ========================================
echo Step 1. Cmake ...
echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
set start=%start:~4,10%
echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
......@@ -150,7 +155,7 @@ call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd6
set build_times=1
:build_tp
echo Build third_party for %build_times% time:
echo Build third_party the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1
......@@ -165,7 +170,7 @@ echo Build third_party successfully!
set build_times=1
:build_paddle
echo Build Paddle for %build_times% time:
echo Build Paddle the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1
......@@ -176,7 +181,9 @@ if %ERRORLEVEL% NEQ 0 (
goto :build_paddle
)
)
echo Build Paddle successfully!
goto:eof
:build_error
......@@ -189,6 +196,17 @@ rem ----------------------------------------------------------------------------
echo ========================================
echo Step 3. Test pip install whl package ...
echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10%
call :timestamp "%start%" "%end%" "Build"
tree /F %cd%\fluid_inference_install_dir\paddle
%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\fluid_inference_install_dir\paddle\lib > lib_size.txt
set /p libsize=< lib_size.txt
for /F %%i in ("%libsize%") do echo "Windows FLuid_Inference Size: %%i"
%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt
set /p whlsize=< whl_size.txt
for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i"
dir /s /b python\dist\*.whl > whl_file.txt
set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
......@@ -215,6 +233,8 @@ echo ========================================
echo Step 4. Running unit tests ...
echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
set start=%start:~4,10%
dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
......@@ -237,8 +257,11 @@ echo ========================================
echo Step 5. Testing fluid library for inference ...
echo ========================================
cd %work_dir%\paddle\fluid\inference\api\demo_ci
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10%
call :timestamp "%start%" "%end%" "TestCases Total"
cd %work_dir%\paddle\fluid\inference\api\demo_ci
%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
goto:eof
......@@ -253,7 +276,6 @@ echo ========================================
echo Step 6. Check whether deleting a unit test ...
echo ========================================
set PATH=%PYTHON_ROOT%;%PATH%
cd /d %work_dir%\build
echo set -ex> check_change_of_unittest.sh
echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh
......@@ -325,6 +347,43 @@ call paddle_winci\Scripts\deactivate.bat 2>NUL
exit /b 1
:timestamp
echo on
setlocal enabledelayedexpansion
set start=%~1
set dd=%start:~2,2%
set /a dd=100%dd%%%100
set hh=%start:~4,2%
set /a hh=100%hh%%%100
set nn=%start:~6,2%
set /a nn=100%nn%%%100
set ss=%start:~8,2%
set /a ss=100%ss%%%100
set /a start_sec=dd*86400+hh*3600+nn*60+ss
echo %start_sec%
set end=%~2
set dd=%end:~2,2%
set /a dd=100%dd%%%100
if %start:~0,2% NEQ %end:~0,2% (
set month_day=0
for %%i in (01 03 05 07 08 10 12) DO if %%i EQU %start:~0,2% set month_day=31
for %%i in (04 06 09 11) DO if %%i EQU %start:~0,2% set month_day=30
for %%i in (02) DO if %%i EQU %start:~0,2% set month_day=28
set /a dd=%dd%+!month_day!
)
set hh=%end:~4,2%
set /a hh=100%hh%%%100
set nn=%end:~6,2%
set /a nn=100%nn%%%100
set ss=%end:~8,2%
set /a ss=100%ss%%%100
set /a end_secs=dd*86400+hh*3600+nn*60+ss
set /a cost_secs=end_secs-start_sec
echo "Windows %~3 Time: %cost_secs%s"
goto:eof
rem ---------------------------------------------------------------------------------------------
:success
echo ========================================
......@@ -340,7 +399,7 @@ taskkill /f /im git-remote-https.exe 2>NUL
taskkill /f /im vctip.exe 2>NUL
taskkill /f /im cvtres.exe 2>NUL
taskkill /f /im rc.exe 2>NUL
taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe 2>NUL
taskkill /f /im op_function_generator.exe 2>NUL
taskkill /f /im python.exe 2>NUL
call paddle_winci\Scripts\deactivate.bat 2>NUL
taskkill /f /im python.exe 2>NUL
......
......@@ -959,7 +959,7 @@ set +x
retry_unittests_record="$retry_unittests_record$failed_test_lists"
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' )
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
echo "========================================="
echo "This is the ${exec_time_array[$exec_times]} time to re-run"
echo "========================================="
......@@ -1395,6 +1395,26 @@ function example() {
fi
}
function summary_check_problems() {
set +x
local check_style_code=$1
local example_code=$2
if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then
echo "========================================"
echo "summary problems:"
echo "========================================"
if [ $check_style_code -ne 0 ];then
echo "- Check code style failed! Please check the log and fix problems."
fi
if [ $example_code -ne 0 ];then
echo "- Check example code failed! Please check the log and fix problems."
fi
[ $check_style_code -ne 0 ] && exit $check_style_code
[ $example_code -ne 0 ] && exit $example_code
fi
set -x
}
function main() {
local CMD=$1
local parallel_number=$2
......@@ -1407,12 +1427,15 @@ function main() {
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
;;
build_and_check)
check_style
$(check_style >&2)
check_style_code=$?
generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
check_sequence_op_unittest
generate_api_spec ${PYTHON_ABI:-""} "PR"
example
$(example >&2)
example_code=$?
summary_check_problems $check_style_code $example_code
assert_api_spec_approvals
;;
build)
......
......@@ -95,7 +95,6 @@ if (WITH_TESTING)
add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests)
add_subdirectory(paddle/fluid/contrib/slim/tests)
add_subdirectory(paddle/incubate/hapi/tests)
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels
......
......@@ -256,12 +256,16 @@ from .device import get_device
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS
from . import incubate
from .incubate import hapi
from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS
from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS
from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS
from .fluid.dygraph.base import no_grad #DEFINE_ALIAS
from .fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS
from . import jit
from . import static
# high-level api
from .hapi import Model
from .hapi import callbacks
import paddle.text
import paddle.vision
......@@ -196,3 +196,14 @@ def cluster_files_reader(files_pattern,
yield line
return reader
def _check_exists_and_download(path, url, md5, module_name, download=True):
if path and os.path.exists(path):
return path
if download:
return paddle.dataset.common.download(url, module_name, md5)
else:
raise ValueError('{} does not exist and auto download is disabled'.format(
path))
......@@ -36,7 +36,7 @@ import tarfile
import gzip
from collections import defaultdict
import paddle.dataset.common
import paddle
import paddle.compat as cpt
__all__ = [
......
......@@ -13,9 +13,11 @@
# limitations under the License.
# TODO: define the functions to manipulate devices
import re
from paddle.fluid import core
from paddle.fluid import framework
import re
from paddle.fluid.dygraph.parallel import ParallelEnv
__all__ = [
'get_cudnn_version',
......@@ -81,8 +83,8 @@ def set_device(device):
.. code-block:: python
import paddle
paddle.enable_imperative()
paddle.fluid.dygraph.set_device("gpu:0")
paddle.disable_static()
paddle.set_device("cpu")
x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
data = paddle.stack([x1,x2], axis=1)
......@@ -90,18 +92,28 @@ def set_device(device):
lower_device = device.lower()
if lower_device == 'cpu':
place = core.CPUPlace()
framework._set_expected_place(place)
elif lower_device == 'gpu':
if not core.is_compiled_with_cuda():
raise ValueError(
"The device should not be 'gpu', " \
"since PaddlePaddle is not compiled with CUDA")
place = core.CUDAPlace(ParallelEnv().dev_id)
else:
available_device = ((lower_device == 'cpu') or
re.match(r'gpu:\d+', lower_device))
available_device = re.match(r'gpu:\d+', lower_device)
if not available_device:
raise ValueError(
"The device must be a string which is like 'cpu' or 'gpu:0'")
"The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'"
)
if not core.is_compiled_with_cuda():
raise ValueError(
"The device should not be {}, since PaddlePaddle is " \
"not compiled with CUDA".format(avaliable_device))
device_info_list = device.split(':', 1)
device_id = device_info_list[1]
device_id = int(device_id)
place = core.CUDAPlace(device_id)
framework._set_expected_place(place)
framework._set_expected_place(place)
return place
def get_device():
......@@ -116,8 +128,8 @@ def get_device():
.. code-block:: python
import paddle
paddle.enable_imperative()
device = paddle.fluid.dygraph.get_device()
paddle.disable_static()
device = paddle.get_device()
"""
device = ''
......
......@@ -73,20 +73,21 @@ def broadcast(tensor, src, group=0):
Examples:
.. code-block:: python
import paddle
import paddle.prepare_context as prepare_context
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
if paddle.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.broadcast(data, 1)
out = data.numpy()
# [[1, 2, 3], [1, 2, 3]]
import numpy as np
import paddle
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.broadcast(data, 1)
out = data.numpy()
# [[1, 2, 3], [1, 2, 3]]
"""
if in_dygraph_mode():
return core.ops.c_broadcast(tensor, tensor, 'root', src,
......@@ -129,21 +130,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
Examples:
.. code-block:: python
import paddle
from paddle.distributed import ReduceOp
import paddle.prepare_context as prepare_context
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
if paddle.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.all_reduce(data)
out = data.numpy()
# [[5, 7, 9], [5, 7, 9]]
import numpy as np
import paddle
from paddle.distributed import ReduceOp
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.all_reduce(data)
out = data.numpy()
# [[5, 7, 9], [5, 7, 9]]
"""
if in_dygraph_mode():
if op == ReduceOp.SUM:
......@@ -204,20 +206,21 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
Examples:
.. code-block:: python
import paddle
import paddle.prepare_context as prepare_context
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
if paddle.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.reduce(data, 0)
out = data.numpy()
# [[5, 7, 9], [5, 7, 9]]
import numpy as np
import paddle
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]])
else:
np_data = np.array([[1, 2, 3], [1, 2, 3]])
data = paddle.to_tensor(np_data)
paddle.distributed.reduce(data, 0)
out = data.numpy()
# [[5, 7, 9], [5, 7, 9]]
"""
if in_dygraph_mode():
if op == ReduceOp.SUM:
......@@ -286,25 +289,26 @@ def all_gather(tensor_list, tensor, group=0):
Examples:
.. code-block:: python
import paddle
import paddle.prepare_context as prepare_context
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
tensor_list = []
if paddle.ParallelEnv().local_rank == 0:
np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
paddle.distributed.all_gather(tensor_list, data1)
else:
np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
out = paddle.distributed.all_gather(tensor_list, data2)
import numpy as np
import paddle
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
tensor_list = []
if paddle.distributed.ParallelEnv().local_rank == 0:
np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
paddle.distributed.all_gather(tensor_list, data1)
else:
np_data1 = np.array([[1, 2, 3], [1, 2, 3]])
np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
paddle.distributed.all_gather(tensor_list, data2)
"""
op_type = 'c_allgather'
helper = LayerHelper(op_type, **locals())
......@@ -359,25 +363,26 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
Examples:
.. code-block:: python
import paddle
import paddle.prepare_context as prepare_context
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
if paddle.ParallelEnv().local_rank == 0:
np_data1 = np.array([7, 8, 9])
np_data2 = np.array([10, 11, 12])
else:
np_data1 = np.array([1, 2, 3])
np_data2 = np.array([4, 5, 6])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
if paddle.ParallelEnv().local_rank == 0:
paddle.distributed.scatter(data1, src=1)
else:
paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
out = data1.numpy()
import numpy as np
import paddle
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
if paddle.distributed.ParallelEnv().local_rank == 0:
np_data1 = np.array([7, 8, 9])
np_data2 = np.array([10, 11, 12])
else:
np_data1 = np.array([1, 2, 3])
np_data2 = np.array([4, 5, 6])
data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2)
if paddle.distributed.ParallelEnv().local_rank == 0:
paddle.distributed.scatter(data1, src=1)
else:
paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
out = data1.numpy()
"""
op_type = 'c_scatter'
global _default_group
......@@ -425,13 +430,13 @@ def barrier(group=0):
Examples:
.. code-block:: python
import paddle
import paddle.prepare_context as prepare_context
import paddle
from paddle.distributed import init_parallel_env
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id)
prepare_context()
paddle.distributed.barrier()
paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()
paddle.distributed.barrier()
"""
op_type = 'barrier'
temp = paddle.fill_constant([1], dtype="int32", value="1")
......
......@@ -22,8 +22,6 @@ from .runtime_factory import RuntimeFactory
from .util_factory import UtilFactory
from paddle.fluid.wrapped_decorator import wrap_decorator
#__all__ = ['Fleet']
def _inited_runtime_handler_(func):
def __impl__(*args, **kwargs):
......@@ -43,65 +41,123 @@ inited_runtime_handler = wrap_decorator(_inited_runtime_handler_)
class Fleet(object):
"""
Unified API for distributed training of PaddlePaddle
Please reference the https://github.com/PaddlePaddle/Fleet for details
Please reference the https://github.com/PaddlePaddle/FleetX for details
Returns:
Fleet: A Fleet instance
Examples:
Example for collective training:
.. code-block:: python
import paddle.distributed.fleet as fleet
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# do distributed training
Example for parameter server training:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
if fleet.is_first_worker():
print("this is first worker")
print("current node index: {}".format(fleet.worker_index()))
print("total number of worker num: {}".format(fleet.worker_num()))
if fleet.is_worker():
print("this is worker")
print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True)))
print("server num: {}".format(fleet.server_num()))
print("server endpoints: {}".format(fleet.server_endpoints(to_string=True)))
if fleet.is_server():
print("this is server")
fleet.stop_worker()
"""
def __init__(self):
self._runtime_handle = None
self._util = None
self._role_maker = None
self.strategy_compiler = None
self._is_collective = False
self._runtime_handle = None
self._util = None
def init(self, role_maker=None, is_collective=False):
"""
Initialize role_maker in Fleet.
This function is responsible for the distributed architecture
what you want to run your code behind,such as Transpiler,
Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker
This function is responsible for setting up the distributed architecture
that you want to run your code on.
Args:
role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration
of environment variables related to distributed training. If you did not initialize
the role_maker yourself, it will be automatically initialized to PaddleCloudRoleMaker.
The default value is None.
is_collective (bool, optional): A ``bool`` variable that determines whether the program
runs on CPU or GPU. False means distributed training using CPU, and True means
GPU. The default value is False.
Returns:
None
Examples1:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
Examples2:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init(is_collective=True)
Examples3:
.. code-block:: python
import paddle.distributed.fleet as fleet
role = fleet.PaddleCloudRoleMaker()
fleet.init(role)
"""
if isinstance(role_maker, RoleMakerBase):
self._role_maker = role_maker
elif role_maker == None:
if role_maker is None:
if isinstance(is_collective, bool):
self._is_collective = is_collective
self._role_maker = PaddleCloudRoleMaker(
is_collective=self._is_collective)
else:
raise ValueError(
"Something wrong occurred, please check whether is_collective is bool value"
)
"`is_collective` should be instance of `bool`, but got {}".
format(type(is_collective)))
else:
raise ValueError(
"Something wrong occurred, please check whether rolemaker is instance of RoleMakerBase"
)
if isinstance(role_maker, RoleMakerBase):
self._role_maker = role_maker
else:
raise ValueError(
"`role_maker` should be subclass of `RoleMakerBase`, but got {}".
format(type(role_maker)))
self.strategy_compiler = StrategyCompiler()
return None
......@@ -112,7 +168,15 @@ class Fleet(object):
Returns:
bool: True if this is the first node of worker,
False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_first_worker()
"""
return self._role_maker.is_first_worker()
......@@ -122,6 +186,14 @@ class Fleet(object):
Returns:
int: node id
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_index()
"""
return self._role_maker.worker_index()
......@@ -131,6 +203,14 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_num()
"""
return self._role_maker.worker_num()
......@@ -141,15 +221,31 @@ class Fleet(object):
Returns:
bool: True if this is a node of worker,
False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_worker()
"""
return self._role_maker.is_worker()
def worker_endpoints(self, to_string=False):
"""
Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
Get current worker endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
Returns:
list/string: server endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_endpoints()
"""
'''
if to_string:
......@@ -165,6 +261,12 @@ class Fleet(object):
Returns:
int: server number
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_num()
"""
return len(self._role_maker.get_pserver_endpoints())
......@@ -174,6 +276,14 @@ class Fleet(object):
Returns:
int: node id
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_index()
"""
return self._role_maker.server_index()
......@@ -183,14 +293,20 @@ class Fleet(object):
Returns:
list/string: server endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_endpoints()
"""
'''
if to_string:
return ",".join(self._role_maker.get_pserver_endpoints())
else:
return self._role_maker.get_pserver_endpoints()
'''
return ["127.0.0.1:1001", "127.0.0.1:1002"]
def is_server(self):
"""
......@@ -199,14 +315,36 @@ class Fleet(object):
Returns:
bool: True if this is a node of server,
False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_server()
"""
return self._role_maker.is_server()
return self._role_maker.is_server(
) or self._role_maker._is_heter_worker()
@property
def util(self):
"""
Utility functions that can be used under certain runtime
return util
Returns:
UtilBase: instance of UtilBase, can use distributed ops/tools easily.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
util = fleet.util
files = ["1.log", "2.log", "3.log", "4.log"]
files = util.get_file_shard()
"""
return self._util
......@@ -214,41 +352,114 @@ class Fleet(object):
def util(self, util):
"""
Set Utility functions for user-defined runtime
set util
Returns:
None
"""
self._util = util
def barrier_worker(self):
"""
barrier between workers
barrier all workers
Returns:
None
"""
self._role_maker.barrier_worker()
@inited_runtime_handler
def init_worker(self):
"""
init worker
initialize `Communicator` for parameter server training.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_worker()
"""
self._runtime_handle._init_worker()
@inited_runtime_handler
def init_server(self, *args, **kwargs):
"""
init server
init_server runs the executor to initialize the startup program;
if `args` is not empty, it will run load_persistables for incremental training.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_server()
"""
self._runtime_handle._init_server(*args, **kwargs)
@inited_runtime_handler
def run_server(self):
"""
run server
run_server runs the pserver main program with the executor.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
if fleet.is_server():
fleet.init_server()
fleet.run_server()
self._runtime_handle._run_server()
@inited_runtime_handler
def stop_worker(self):
"""
stop worker
stop `Communicator` and notify the parameter server that training is complete.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_server()
fleet.stop_worker()
"""
self._runtime_handle._stop_worker()
......@@ -259,27 +470,98 @@ class Fleet(object):
target_vars,
main_program=None,
export_for_deployment=True):
"""
save inference model for inference.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_server()
"""
self._runtime_handle._save_inference_model(
executor, dirname, feeded_var_names, target_vars, main_program,
export_for_deployment)
def save_persistables(self, executor, dirname, main_program=None):
"""
saves all persistable variables from :code:`main_program` to
the folder :code:`dirname`. The :code:`dirname` is used to specify the folder where
persistable variables are going to be saved. If you would like to save variables in
separate files, set :code:`filename` to None.
Args:
executor(Executor): The executor to run for saving persistable variables.
You can refer to :ref:`api_guide_executor_en` for
more details.
dirname(str, optional): The saving directory path.
When you need to save the parameter to the memory, set it to None.
main_program(Program, optional): The program whose persistable variables will
be saved. Default: None.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
fleet.init()
# build net
# fleet.distributed_optimizer(...)
exe = fluid.Executor(fluid.CPUPlace())
fleet.save_persistables(exe, "dirname", fluid.default_main_program())
"""
self._runtime_handle._save_persistables(executor, dirname, main_program)
def distributed_optimizer(self, optimizer, strategy=None):
"""
distirbuted_optimizer
Optimizer for distributed training.
For the distributed training, this method would rebuild a new instance of DistributedOptimizer.
Which has basic Optimizer function and special features for distributed training.
Args:
optimizer(Optimizer): The optimizer to be used in distributed training.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
Returns:
Fleet instance with minimize interface like optimizers
Fleet: instance of fleet.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
"""
self.user_defined_optimizer = optimizer
if strategy == None:
......@@ -316,23 +598,25 @@ class Fleet(object):
``fetch_list`` before run, see details in ``Executor``.
Examples:
import paddle
import paddle.distributed.fleet as fleet
.. code-block:: python
fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh')
fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh')
prediction = paddle.layers.fc(input=[fc_2], size=label_dim, act='softmax')
cost = paddle.layers.cross_entropy(input=prediction, label=input_y)
avg_cost = paddle.layers.mean(x=cost)
import paddle
import paddle.distributed.fleet as fleet
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax')
cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
# for more examples, please reference https://github.com/PaddlePaddle/Fleet
# for more examples, please reference https://github.com/PaddlePaddle/FleetX
"""
context = {}
......@@ -372,10 +656,10 @@ class Fleet(object):
can_not_apply_optimizer_list.append(opt)
# combine recalled meta optimizers to be a valid meta optimizer
meta_optimizer, graph_optimizer = \
self.strategy_compiler.generate_optimizer(
loss, self._role_maker, self.user_defined_optimizer,
self.user_defined_strategy, valid_optimizer_list,
valid_graph_optimizer_list)
valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list)
......
......@@ -14,6 +14,7 @@
"""Defination of Role Makers."""
import os
import numpy as np
import warnings
from multiprocessing import Process, Manager
import paddle.fluid as fluid
......@@ -23,6 +24,7 @@ import paddle.fluid as fluid
class Role:
WORKER = 1
SERVER = 2
HETER_WORKER = 3
class RoleMakerBase(object):
......@@ -40,6 +42,11 @@ class RoleMakerBase(object):
self._role = None
self._current_id = -1
# for heter parameter server mode
self._heter_trainer_endpoints = []
self._heter_trainer_device = "CPU"
self._is_heter_parameter_server_mode = False
self._node_type = None
self._node_type_comm = None
self._all_comm = None
......@@ -163,12 +170,58 @@ class RoleMakerBase(object):
"""
print("warning: RoleMakerBase does not have barrier worker.")
def _is_heter_worker(self):
"""
Return is_heter_worker() of current process
"""
warnings.warn("RoleMakerBase does not have function: _is_heter_worker.")
return False
def _heter_worker_num(self):
"""
Get current total heter-worker number.
Returns:
int: heter_worker number
"""
warnings.warn(
"RoleMakerBase does not have function: _heter_worker_num.")
return 0
def _get_heter_worker_endpoints(self):
"""
Returns:
string: all heter_trainers' endpoints
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self):
"""
Returns:
string: the corresponding heter_trainer's endpoint
e.g.: if we have 4 cpu-trainers (default) and 2 gpu-trainers (heter),
then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints[(self._current_id + 1) %
self._heter_worker_num()]
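A quick standalone check of the index arithmetic used above (the endpoints are hypothetical; this only exercises the `(current_id + 1) % n` mapping):

    heter_endpoints = ['10.0.0.1:7000', '10.0.0.2:7000']  # hypothetical heter trainers

    def heter_endpoint_for(trainer_id, endpoints=heter_endpoints):
        # Same index arithmetic as _get_heter_worker_endpoint above.
        return endpoints[(trainer_id + 1) % len(endpoints)]

    for tid in range(4):  # 4 cpu-trainers paired with 2 heter trainers
        print(tid, heter_endpoint_for(tid))
    # 0 10.0.0.2:7000
    # 1 10.0.0.1:7000
    # 2 10.0.0.2:7000
    # 3 10.0.0.1:7000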
def _get_heter_worker_device(self):
"""
Returns:
string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
"""
return self._heter_trainer_device.upper()
class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, **kwargs):
super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective
self._init_gloo = False #default no init gloo
self._init_gloo = False # default no init gloo
self._kwargs = kwargs
self._role_is_generated = False
......@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
"""
get index of current node
"""
if self.is_server():
return self.server_index()
elif self.is_worker():
return self.worker_index()
return self._current_id
def worker_num(self):
"""
......@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self.generate_role()
return self._server_endpoints
def _heter_worker_num(self):
"""
get heter worker nums
"""
if not self._role_is_generated:
self.generate_role()
return self._heter_trainers_num
def _is_heter_worker(self):
"""
whether current process is heter worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.HETER_WORKER
def _get_rank(self):
"""
get current rank in all workers and pservers
......@@ -342,17 +408,47 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _ps_env(self):
try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
self._server_endpoints = os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
# format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
"").split(",")
assert self._server_endpoints != ""
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
assert self._server_endpoints != ""
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
raise ValueError(
"TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
format(training_role))
# For heter parameter server env setting
heter_trainer_eplist = os.getenv(
"PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
None)
if heter_trainer_eplist and heter_trainer_device:
try:
heter_trainer_eplist = os.environ[
"PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
except:
raise ValueError(
"Cannot find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT'."
)
self._is_heter_parameter_server_mode = True
heter_trainers_num = len(heter_trainer_eplist)
current_node_device = heter_trainer_device.upper()
if current_node_device not in ["CPU", "GPU", "XPU"]:
raise ValueError(
"Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
format(heter_trainer_device))
self._heter_trainer_device = current_node_device
else:
self._is_heter_parameter_server_mode = False
heter_trainers_num = 0
if training_role == "TRAINER":
role = Role.WORKER
......@@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase):
ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint)
elif training_role == "HETER_TRAINER":
role = Role.HETER_WORKER
cur_ip = os.environ["POD_IP"]
cur_port = os.environ["PADDLE_PORT"]
curr_endpoint = ":".join([cur_ip, cur_port])
current_id = heter_trainer_eplist.index(curr_endpoint)
else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as ve:
raise ValueError(
"TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
except ValueError as e:
raise ValueError(
"something wrong with PaddleCloud, please check environment")
"Something wrong with PaddleCloud, please check environment")
self._trainers_num = trainers_num
self._role = role
self._current_id = current_id
self._node_num = len(
set([x.split(':')[0] for x in self._worker_endpoints]))
self._heter_trainers_num = heter_trainers_num
self._heter_trainer_endpoints = heter_trainer_eplist
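For local experimentation, the environment this parser expects can be stubbed in a few lines before the role maker runs (all addresses and values below are hypothetical; real jobs get them from the launcher):

    import os

    # Hypothetical single-machine setup consumed by PaddleCloudRoleMaker._ps_env().
    os.environ['PADDLE_PSERVERS_IP_PORT_LIST'] = '127.0.0.1:6001,127.0.0.1:6002'
    os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6101,127.0.0.1:6102'
    os.environ['PADDLE_TRAINERS_NUM'] = '2'
    os.environ['TRAINING_ROLE'] = 'PSERVER'  # or 'TRAINER' / 'HETER_TRAINER'
    os.environ['POD_IP'] = '127.0.0.1'       # read by the PSERVER branch
    os.environ['PADDLE_PORT'] = '6001'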
def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
......
......@@ -15,10 +15,10 @@ from .amp_optimizer import AMPOptimizer
from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer
from .async_optimizer import AsyncMetaOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer
from .lars_optimizer import LarsOptimizer
from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer
from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer
......@@ -13,12 +13,12 @@
from paddle import fluid
from paddle.fluid import compiler
from .async_optimizer import AsyncMetaOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
class ParameterServerGraphOptimizer(ParameterServerOptimizer):
def __init__(self, optimizer):
super(AsyncGraphExecutionOptimizer, self).__init__(optimizer)
super(ParameterServerGraphOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = []
......@@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
if self.role_maker.is_server():
return False
if self.role_maker._is_heter_parameter_server_mode:
return False
return True
def _disable_strategy(self, dist_strategy):
......
......@@ -15,9 +15,9 @@ from paddle import fluid
from .meta_optimizer_base import MetaOptimizerBase
class AsyncMetaOptimizer(MetaOptimizerBase):
class ParameterServerOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
super(AsyncMetaOptimizer, self).__init__(optimizer)
super(ParameterServerOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = []
......@@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_startup = worker.init_from_server_pass(_startup, compiled_config)
_startup = worker.delet_extra_optimizes_pass(_startup,
compiled_config)
# for heter program
if self.role_maker._is_heter_parameter_server_mode:
from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
if self.role_maker._is_heter_worker():
# for heter worker
_main = heter_worker.split_heter_worker_ops_pass(
_main, compiled_config)
else:
# for default worker
_main = heter_worker.split_trainer_ops_pass(_main,
compiled_config)
# for startup change
_startup = heter_worker.delete_startup_useless_ops_var_pass(
_startup, _main, compiled_config)
else:
_main = worker.append_send_ops_pass(_main, compiled_config)
_startup = _startup
......@@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_origin_startup_program,
strategy, self.role_maker)
main_program, startup_program = \
self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \
else self._build_pserver_programs(compiled_config)
if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
main_program, startup_program = self._build_trainer_programs(
compiled_config)
elif self.role_maker.is_server():
main_program, startup_program = self._build_pserver_programs(
compiled_config)
loss.block.program = main_program
fluid.framework.switch_startup_program(startup_program)
......
......@@ -154,15 +154,16 @@ class ParameterServerRuntime(RuntimeBase):
kwargs["sparse_attrs"] = get_sparse_attrs()
return kwargs
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
SyncStrategy, GeoStrategy
trainer_config = self.async_strategy.get_trainer_runtime_config()
lrs = _get_lr_ops(self.origin_main_program)
if len(lrs) > 0:
lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
if lrs:
kwargs = {"need_global_step": "1"}
else:
kwargs = {"need_global_step": "0"}
......@@ -196,6 +197,18 @@ class ParameterServerRuntime(RuntimeBase):
else:
warnings.warn("communicator has been initialized, skip")
def _get_executor(self):
if self.role_maker._is_heter_worker():
if self.role_maker._get_heter_worker_device() == "GPU":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
executor = fluid.Executor(fluid.CUDAPlace(gpu_id))
else:
raise ValueError("Not Support Device {}".format(
self.role_maker._get_heter_worker_device()))
else:
executor = fluid.Executor(fluid.CPUPlace())
return executor
def _init_server(self, *args, **kwargs):
if len(args) > 1:
raise ValueError("init server can only accept 1 args: `dirname`")
......@@ -204,9 +217,15 @@ class ParameterServerRuntime(RuntimeBase):
else:
model_dirname = None
executor = fluid.Executor(fluid.CPUPlace())
if self.role_maker._is_heter_worker():
self._init_worker()
executor = self._get_executor()
executor.run(fluid.default_startup_program())
if self.role_maker._is_heter_worker():
return
if not model_dirname:
return
......@@ -237,12 +256,12 @@ class ParameterServerRuntime(RuntimeBase):
# self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
def _run_server(self):
executor = fluid.Executor(fluid.CPUPlace())
executor = self._get_executor()
executor.run(fluid.default_main_program())
def _stop_worker(self):
self._communicator.stop()
executor = fluid.Executor(fluid.CPUPlace())
executor = self._get_executor()
executor.close()
def _get_optimizer_status(self, op, param_name):
......
......@@ -29,13 +29,13 @@ __all__ = ["init_parallel_env"]
ParallelStrategy = core.ParallelStrategy
def init_parallel_env(backend='nccl'):
def init_parallel_env():
"""
Initialize parallel training environments in dynamic mode.
Initialize parallel training environment in dynamic graph mode.
Args:
backend(str, optional): The backend to communication between multiple devices.
Now only support ``nccl`` . Default value is ``nccl`` .
.. note::
Now only supports initializing the GPU parallel training
environment and using NCCL for communication.
Returns:
None
......@@ -89,14 +89,12 @@ def init_parallel_env(backend='nccl'):
dist.spawn(train)
"""
# 1. input check
if not isinstance(backend, six.string_types):
raise TypeError("input `backend` type error, expected type is str, "
"but received type is %s." % type(backend))
if cpt.to_text(backend) != 'nccl':
raise ValueError(
"backend `%s` is not supported, now only supports `nccl` backend." %
backend)
# 1. gpu check
if not core.is_compiled_with_cuda():
raise NotImplementedError(
"Cannot initialize parallel environment in CPU-only version, now only "
"supports initializing the GPU parallel environment. Please recompile "
"or reinstall paddle with GPU support.")
# 2. check env
def _check_var_exists(var_name):
......@@ -112,30 +110,28 @@ def init_parallel_env(backend='nccl'):
_check_var_exists("PADDLE_TRAINERS_NUM")
_check_var_exists("PADDLE_TRAINER_ENDPOINTS")
# 3. init ParallelStrategy
# 3. init NCCL ParallelStrategy
strategy = ParallelStrategy()
if cpt.to_text(backend) == 'nccl':
if parallel_helper._is_parallel_ctx_initialized():
warnings.warn("The parallel environment has been initialized.")
strategy.nranks = ParallelEnv().world_size
strategy.local_rank = ParallelEnv().rank
strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
strategy.current_endpoint = ParallelEnv().current_endpoint
if strategy.nranks < 2:
return
# NOTE(chenweihang): [ why config global place here? ]
# the dygraph mode will be set to default mode,
# users will not call `dygraph.guard` or `enable_dygraph`
# directly, if they want to switch default place,
# they need to call a function to change default place,
# here just set correctly place to users
place = core.CUDAPlace(ParallelEnv().device_id)
_set_expected_place(place)
# init nccl context
parallel_helper._set_parallel_ctx(
core.NCCLParallelContext(strategy, place))
parallel_helper._init_parallel_ctx()
if parallel_helper._is_parallel_ctx_initialized():
warnings.warn("The parallel environment has been initialized.")
strategy.nranks = ParallelEnv().world_size
strategy.local_rank = ParallelEnv().rank
strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
strategy.current_endpoint = ParallelEnv().current_endpoint
if strategy.nranks < 2:
return
# NOTE(chenweihang): [ why config global place here? ]
# the dygraph mode will be set to default mode,
# users will not call `dygraph.guard` or `enable_dygraph`
# directly, if they want to switch default place,
# they need to call a function to change default place,
# here just set correctly place to users
place = core.CUDAPlace(ParallelEnv().device_id)
_set_expected_place(place)
# init nccl context
parallel_helper._set_parallel_ctx(core.NCCLParallelContext(strategy, place))
parallel_helper._init_parallel_ctx()
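Taken together with the checks above, a minimal dynamic-graph launch looks like this (a sketch; it assumes a CUDA build of Paddle and that the processes are started through `paddle.distributed.spawn`, which sets the environment variables checked in step 2):

    import paddle
    import paddle.distributed as dist

    def train():
        paddle.disable_static()   # enter dynamic graph mode
        dist.init_parallel_env()  # gpu check, env check, NCCL context init
        print("rank {} of {}".format(dist.get_rank(), dist.get_world_size()))

    if __name__ == '__main__':
        dist.spawn(train, nprocs=2)  # starts 2 GPU processes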
def get_rank():
......@@ -163,7 +159,7 @@ def get_rank():
def get_world_size():
"""
The number of trainers (number of processes participating in current job).
Returns the number of trainers (number of processes participating in current job).
Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` .
The default value is 1.
......
......@@ -236,8 +236,6 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
func (function): The target function is called by spawned process.
This function need to be able to pickled, so it must be defined
at the top level of a module.
This function should be called as ``func(i, *args)``, ``i`` is
the process index and ``args`` contains other arguments as tuple.
args (tuple, optional): Arguments passed to ``func``.
nprocs (int, optional): Number of processes to start. Default: -1.
when nprocs is -1, the available device will be obtained from
......@@ -246,8 +244,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available
CPU number is obtained from the environment variable CPU_NUM.
For example, export CPU_NUM=4, if the environment variable is not set,
the executor will add the variable to the environment variable and
set its value to 1.
the spawn method will add a default value to the environment variable
and set it to 1.
join (bool, optional): Perform a blocking join on all spawned processes.
Default: True.
daemon (bool, optional): The spawned processes' daemon flag. Default: False.
......@@ -266,8 +264,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
such as 6170. Default: None;
(5) selected_gpus (string): The training process will run on the
selected_gpus, such as "0,1,2,3". Default: None;
(6) print_config: Print current parallel training config. Default: False;
(7) use_paddlecloud: Whether to use paddlecloud platform to run your
(6) print_config (bool): Print current parallel training config. Default: False;
(7) use_paddlecloud (bool): Whether to use paddlecloud platform to run your
multi-process job. Default: False.
Returns:
......
......@@ -129,7 +129,7 @@ class GradientClipBase(object):
def __str__(self):
raise NotImplementedError()
@imperative_base.no_grad()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
raise NotImplementedError
......@@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase):
def __str__(self):
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
......@@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase):
def __str__(self):
return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
@imperative_base.no_grad()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
for p, g in params_grads:
......@@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
def __str__(self):
return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
@imperative_base.no_grad()
@imperative_base.no_grad
def _dygraph_clip(self, params_grads):
params_and_grads = []
sum_square_list = []
......
......@@ -69,7 +69,7 @@ class ImperativeQuantAware(object):
from paddle.fluid.contrib.slim.quantization \
import ImperativeQuantAware
from paddle.incubate.hapi.vision.models \
from paddle.vision.models \
import resnet
model = resnet.resnet50(pretrained=True)
......
......@@ -16,10 +16,12 @@ from __future__ import print_function
from __future__ import division
import numpy as np
import math
from .sampler import Sampler, SequenceSampler, RandomSampler
from .dataset import Dataset, IterableDataset
__all__ = ["BatchSampler"]
__all__ = ["BatchSampler", "DistributedBatchSampler"]
class BatchSampler(Sampler):
......@@ -158,3 +160,185 @@ class _InfiniteIterableSampler(object):
def __iter__(self):
while True:
yield [None] * self.batch_size
class DistributedBatchSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such a case, each process can pass a DistributedBatchSampler instance
as a DataLoader sampler, and load a subset of the original dataset that
is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Args:
dataset(paddle.io.Dataset): this can be an instance of `paddle.io.Dataset`
or another Python object that implements
`__len__`, from which BatchSampler gets the
number of samples in the data source.
batch_size(int): the number of sample indices in each mini-batch.
num_replicas(int, optional): process number in distributed training.
If :attr:`num_replicas` is None, :attr:`num_replicas` will be
retrieved from :code:`paddle.fluid.dygraph.parallel.ParallelEnv`.
Default None.
rank(int, optional): the rank of the current process among :attr:`num_replicas`
processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
:code:`paddle.fluid.dygraph.parallel.ParallelEnv`. Default None.
shuffle(bool): whether to shuffle the order of indices before generating
batch indices. Default False.
drop_last(bool): whether to drop the last incomplete batch when the dataset size
is not divisible by the batch size. Default False.
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler:
# do something
break
"""
def __init__(self,
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=False,
drop_last=False):
self.dataset = dataset
assert isinstance(batch_size, int) and batch_size > 0, \
"batch_size should be a positive integer"
self.batch_size = batch_size
assert isinstance(shuffle, bool), \
"shuffle should be a boolean value"
self.shuffle = shuffle
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
from paddle.fluid.dygraph.parallel import ParallelEnv
if num_replicas is not None:
assert isinstance(num_replicas, int) and num_replicas > 0, \
"num_replicas should be a positive integer"
self.nranks = num_replicas
else:
self.nranks = ParallelEnv().nranks
if rank is not None:
assert isinstance(rank, int) and rank >= 0, \
"rank should be a non-negative integer"
self.local_rank = rank
else:
self.local_rank = ParallelEnv().local_rank
self.drop_last = drop_last
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
self.total_size = self.num_samples * self.nranks
def __iter__(self):
num_samples = len(self.dataset)
indices = np.arange(num_samples).tolist()
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
if self.shuffle:
np.random.RandomState(self.epoch).shuffle(indices)
self.epoch += 1
# subsample
def _get_indices_by_batch_size(indices):
subsampled_indices = []
last_batch_size = self.total_size % (self.batch_size * self.nranks)
assert last_batch_size % self.nranks == 0
last_local_batch_size = last_batch_size // self.nranks
for i in range(self.local_rank * self.batch_size,
len(indices) - last_batch_size,
self.batch_size * self.nranks):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
subsampled_indices.extend(indices[
self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices
if self.nranks > 1:
indices = _get_indices_by_batch_size(indices)
assert len(indices) == self.num_samples
_sample_iter = iter(indices)
batch_indices = []
for idx in _sample_iter:
batch_indices.append(idx)
if len(batch_indices) == self.batch_size:
yield batch_indices
batch_indices = []
if not self.drop_last and len(batch_indices) > 0:
yield batch_indices
def __len__(self):
num_samples = self.num_samples
num_samples += int(not self.drop_last) * (self.batch_size - 1)
return num_samples // self.batch_size
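To see the padding and subsampling arithmetic above in action, here is a small pure-Python check using the same formulas (the numbers are arbitrary):

    import math

    dataset_len, nranks, batch_size = 10, 4, 2
    num_samples = int(math.ceil(dataset_len * 1.0 / nranks))  # 3 samples per replica
    total_size = num_samples * nranks                         # 12: dataset padded by 2
    indices = list(range(dataset_len))
    indices += indices[:total_size - len(indices)]            # pad with leading indices
    print(num_samples, total_size, indices)
    # 3 12 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1]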
def set_epoch(self, epoch):
"""
Sets the epoch number. When :attr:`shuffle=True`, this number is used
as the seed for the random ordering. By default, users need not set this;
all replicas (workers) use a different random ordering for each epoch.
If the same number is set for every epoch, this sampler will yield the same
ordering in all epochs.
Arguments:
epoch (int): Epoch number.
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10):
sampler.set_epoch(epoch)
"""
self.epoch = epoch
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
import inspect
import decorator
import contextlib
import functools
import inspect
import sys
import numpy as np
from paddle.fluid import core
......@@ -26,8 +27,8 @@ import objgraph
from ..data_feeder import convert_dtype
__all__ = [
'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled',
'to_variable'
'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
'enabled', 'to_variable'
]
......@@ -167,7 +168,80 @@ def disable_dygraph():
_functional_dygraph_context_manager = None
class no_grad:
@signature_safe_contextmanager
def _switch_tracer_mode_guard_(is_train=True):
tracer = framework._dygraph_tracer()
if tracer:
mode = tracer._train_mode
tracer._train_mode = is_train
try:
yield
finally:
tracer._train_mode = mode
else:
yield
def no_grad(func=None):
"""
:api_attr: imperative
Create a context which disables dygraph gradient calculation.
In this mode, the result of every computation will have `stop_gradient=True`.
Also functions as a decorator. (Make sure to use it without parentheses.)
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
# use as context manager
data = np.array([[2, 3], [4, 5]]).astype('float32')
with fluid.dygraph.guard():
l0 = fluid.Linear(2, 2) # l0.weight.gradient() is None
l1 = fluid.Linear(2, 2)
with fluid.dygraph.no_grad():
# l1.weight.stop_gradient is False
tmp = l1.weight * 2 # tmp.stop_gradient is True
x = fluid.dygraph.to_variable(data)
y = l0(x) + tmp
o = l1(y)
o.backward()
print(tmp.gradient() is None) # True
print(l0.weight.gradient() is None) # False
# use as decorator
@fluid.dygraph.no_grad
def test_layer():
with fluid.dygraph.guard():
inp = np.ones([3, 1024], dtype='float32')
t = fluid.dygraph.base.to_variable(inp)
linear1 = fluid.Linear(1024, 4, bias_attr=False)
linear2 = fluid.Linear(4, 4)
ret = linear1(t)
dy_ret = linear2(ret)
test_layer()
"""
if func is None:
return _switch_tracer_mode_guard_(is_train=False)
else:
@decorator.decorator
def __impl__(func, *args, **kwargs):
with _switch_tracer_mode_guard_(is_train=False):
return func(*args, **kwargs)
return __impl__(func)
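The dual behavior above (context manager when called bare, decorator when applied to a function) can be reproduced with a small standalone sketch; `_grad_enabled` below is a stand-in for the tracer's train mode, not Paddle's real state:

    import contextlib
    import functools

    _grad_enabled = True  # stand-in for tracer._train_mode

    @contextlib.contextmanager
    def _switch_guard(enabled):
        global _grad_enabled
        prev, _grad_enabled = _grad_enabled, enabled
        try:
            yield
        finally:
            _grad_enabled = prev

    def no_grad(func=None):
        if func is None:           # used as `with no_grad():`
            return _switch_guard(False)
        @functools.wraps(func)     # used as `@no_grad`
        def wrapper(*args, **kwargs):
            with _switch_guard(False):
                return func(*args, **kwargs)
        return wrapper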
class no_grad_:
"""
:api_attr: imperative
......
......@@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False):
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
para_dict = {}
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
......
......@@ -16,9 +16,7 @@ import astor
import gast
from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper
from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api, is_to_variable
from paddle.fluid.dygraph.dygraph_to_static.utils import to_assign_node, to_static_ast, update_args_of_func
from paddle.fluid.dygraph.dygraph_to_static.utils import dygraph_class_to_static_api
from paddle.fluid.dygraph.dygraph_to_static import utils
class BasicApiTransformer(gast.NodeTransformer):
......@@ -56,7 +54,7 @@ class BasicApiTransformer(gast.NodeTransformer):
if isinstance(child_node, gast.Call):
# TODO(liym27):
# Considers that a dygraph api which modifies the input or has a output.
if is_dygraph_api(child_node):
if utils.is_dygraph_api(child_node):
return
else:
self._visit_Call(child_node)
......@@ -73,7 +71,7 @@ class BasicApiTransformer(gast.NodeTransformer):
if self._is_dygraph_forward(func_name):
class_node = self._get_class_node(func_name)
static_node = to_static_ast(node, class_node)
static_node = utils.to_static_ast(node, class_node)
return static_node
else:
return node
......@@ -91,14 +89,51 @@ class BasicApiTransformer(gast.NodeTransformer):
if is_to_variable(node_value):
return False
if is_dygraph_api(node_value):
if utils.is_dygraph_api(node_value):
dygraph_api = node_value.func.attr
if not dygraph_class_to_static_api.get(dygraph_api):
if not utils.dygraph_class_to_static_api.get(dygraph_api):
return False
update_args_of_func(node_value, node_value, "__init__")
utils.update_args_of_func(node_value, node_value, "__init__")
target_str = astor.to_source(gast.gast_to_ast(node.targets[0]))
self.class_node_dict[target_str] = node_value
return True
# TODO: node.value is not dygraph class
return False
def is_to_variable(node):
assert isinstance(node, gast.Call)
api_name = utils.ast_to_source_code(node.func).strip()
if utils.is_dygraph_api(node):
return api_name.endswith("to_variable")
if utils.is_paddle_api(node):
return api_name.endswith("to_tensor")
return False
def to_assign_node(node):
# Transform dygraph api `fluid.dygraph.to_variable` alias `paddle.to_tensor` to static api `fluid.layers.assign`.
# NOTE:
# 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16},
# but api `assign` only supports {float32, float64, int32, int64, bool};
# 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024.
assert isinstance(node, gast.Call)
assign_api = gast.parse('fluid.layers.assign').body[0].value
node.func = assign_api
if node.args:
node.args = [node.args[0]]
node.keywords = []
else:
for idx, kw in enumerate(node.keywords):
if kw.arg == 'value' or kw.arg == 'data':
node.keywords[idx].arg = 'input'
node.keywords = [node.keywords[idx]]
node.args = []
break
return node
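A small round-trip of the transform above (requires the `gast` package; the source snippet is arbitrary and only for illustration):

    import gast

    # Parse a dygraph-style call and rewrite it with to_assign_node above.
    call_node = gast.parse("fluid.dygraph.to_variable(x)").body[0].value
    new_node = to_assign_node(call_node)
    # The callee is now the attribute chain fluid.layers.assign,
    # and the single positional argument is kept.
    print(gast.dump(new_node.func))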
......@@ -296,7 +296,7 @@ def convert_to_input_spec(inputs, input_spec):
elif isinstance(input_spec, dict):
input_with_spec = {}
check_type_and_len(inputs, input_spec, True)
for name, input in inputs.items():
for name, input in six.iteritems(inputs):
if name in input_spec:
input_with_spec[name] = convert_to_input_spec(input,
input_spec[name])
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import six
import copy
from collections import defaultdict
......@@ -230,7 +231,7 @@ class NameVisitor(gast.NodeVisitor):
return False
def _update_name_ids(self, new_name_ids):
for name_id, ctxs in new_name_ids.items():
for name_id, ctxs in six.iteritems(new_name_ids):
self.name_ids[name_id] = ctxs + self.name_ids[name_id]
......@@ -250,7 +251,7 @@ def parse_cond_args(var_ids_dict, return_ids=None, ctx=gast.Load):
"""
name_ids = [
var_id for var_id, var_ctx in var_ids_dict.items()
var_id for var_id, var_ctx in six.iteritems(var_ids_dict)
if isinstance(var_ctx[0], ctx)
]
if return_ids:
......@@ -341,7 +342,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict,
def _vars_with_store(ids_dict):
vars = []
for k, ctxs in ids_dict.items():
for k, ctxs in six.iteritems(ids_dict):
if _is_return_var(ctxs):
vars.append(k)
return vars
......@@ -353,7 +354,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict,
def _vars_loaded_before_store(ids_dict):
new_dict = defaultdict(list)
for k, ctxs in ids_dict.items():
for k, ctxs in six.iteritems(ids_dict):
for ctx in ctxs:
if isinstance(ctx, gast.Load):
new_dict[k].append(ctx)
......
......@@ -98,8 +98,15 @@ class TranslatorLogger(object):
return level == self.transformed_code_level
def has_verbosity(self, level):
"""
Checks whether the verbosity level set by the user is greater than or equal to the log level.
Args:
level(int): The level of log.
Returns:
True if the verbosity level set by the user is greater than or equal to the log level, otherwise False.
"""
level = self.check_level(level)
return level >= self.verbosity_level
return self.verbosity_level >= level
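The corrected comparison means a message at `level` is emitted only when the user-configured verbosity is at least that high; a standalone illustration (plain ints, no logger):

    verbosity_level = 2  # hypothetical user setting

    def has_verbosity(level):
        # Emit messages whose level does not exceed the configured verbosity.
        return verbosity_level >= level

    print(has_verbosity(1), has_verbosity(2), has_verbosity(3))  # True True False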
def error(self, msg, *args, **kwargs):
self.logger.error(msg, *args, **kwargs)
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import numpy as np
import logging
import six
from paddle.fluid import log_helper
from paddle.fluid import framework, backward, core
......@@ -334,7 +335,7 @@ class PartialProgramLayer(layers.Layer):
param_and_buffer_names_set.add(var.name)
for block in main_program.blocks:
for name, var in block.vars.items():
for name, var in six.iteritems(block.vars):
if isinstance(var, framework.Parameter):
if name not in param_and_buffer_names_set:
raise ValueError(
......
......@@ -24,6 +24,7 @@ import warnings
import gast
from paddle.fluid import framework
from paddle.fluid import in_dygraph_mode
from paddle.fluid.dygraph import layers
from paddle.fluid.data_feeder import check_type
from paddle.fluid.layers.utils import flatten
......@@ -32,6 +33,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
from paddle.fluid.dygraph.dygraph_to_static import logging_utils
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
......@@ -283,13 +285,21 @@ class StaticLayer(object):
Return:
Outputs of decorated function.
"""
# 1. call dygraph function directly if not enable `declarative`
if not self._program_trans.enable_declarative:
warnings.warn(
"The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. "
logging_utils.warn(
"The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. "
"We will just return dygraph output.")
return self._call_dygraph_function(*args, **kwargs)
if not in_dygraph_mode() and self._program_trans.enable_declarative:
raise RuntimeError(
"Failed to run the callable object {} decorated by '@paddle.jit.to_static', "
"because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
"following API: paddle.disable_static().".format(
self.dygraph_function))
# 2. trace ops from dygraph layers and cache the generated program.
args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
try:
......@@ -393,19 +403,43 @@ class StaticLayer(object):
def concrete_program(self):
"""
Returns recent ConcreteProgram instance of decorated function.
Examples:
.. code-block:: python
import paddle
from paddle.jit import to_static
from paddle.static import InputSpec
paddle.disable_static()
def foo(x, y):
z = x + y
return z
# usage 1:
decorated_foo = to_static(foo, input_spec=[InputSpec([10], name='x'), InputSpec([10], name='y')])
print(decorated_foo.concrete_program)
# usage 2:
decorated_foo = to_static(foo)
out_foo = decorated_foo(paddle.rand([10]), paddle.rand([10]))
print(decorated_foo.concrete_program)
"""
# If `input_spec` is specified, the length of the program cache will always be 1;
# otherwise, return the last one.
cached_program_len = len(self._program_cache)
# If `input_spec` is specified, apply the conversion from dygraph layers into a static Program.
if cached_program_len == 0:
if len(self._function_spec.flat_input_spec) > 0:
input_spec = self._function_spec.input_spec
input_spec = self._function_spec.input_spec
has_input_spec = (input_spec is not None and len(input_spec) > 0)
if has_input_spec:
concrete_program, _ = self.get_concrete_program(*input_spec)
return concrete_program
else:
raise ValueError("No valid transformed program for {}".format(
self._function_spec))
raise ValueError(
"No valid transformed program for {}.\n\t Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n".
format(self._function_spec))
# If more than one programs have been cached, return the recent converted program by default.
elif cached_program_len > 1:
logging.warning(
......@@ -617,7 +651,7 @@ class ProgramCache(object):
return len(self._caches)
def concrete_programs(self):
return [cp for key, (cp, _) in self._caches.iteritems()]
return [cp for key, (cp, _) in six.iteritems(self._caches)]
def synchronized(func):
......
......@@ -754,7 +754,7 @@ def save(layer, model_path, input_spec=None, configs=None):
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
for structured_name, var in layer.state_dict().items():
for structured_name, var in six.iteritems(layer.state_dict()):
state_names_dict[var.name] = structured_name
# 3. share parameters from Layer to scope & record var info
......
(40 additional file diffs collapsed.)