Commit 68e89f8a authored by: Z zlsh80826

merge PaddlePaddle/Paddle develop

...@@ -154,17 +154,10 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() { ...@@ -154,17 +154,10 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() {
C.PD_EnableMkldnnQuantizer(config.c) C.PD_EnableMkldnnQuantizer(config.c)
} }
func (config *AnalysisConfig) EnableMkldnnBfloat16() {
C.PD_EnableMkldnnBfloat16(config.c)
}
func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool { func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool {
return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c)) return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c))
} }
func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool {
return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c))
}
// SetModelBuffer // SetModelBuffer
// ModelFromMemory // ModelFromMemory
......
...@@ -95,9 +95,10 @@ void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name, ...@@ -95,9 +95,10 @@ void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
const std::string& fs_ugi) { const std::string& fs_ugi) {
fs_name_ = fs_name; fs_name_ = fs_name;
fs_ugi_ = fs_ugi; fs_ugi_ = fs_ugi;
std::string cmd = std::string("hadoop fs"); std::string cmd = std::string("$HADOOP_HOME/bin/hadoop fs");
cmd += " -D fs.default.name=" + fs_name; cmd += " -D fs.default.name=" + fs_name;
cmd += " -D hadoop.job.ugi=" + fs_ugi; cmd += " -D hadoop.job.ugi=" + fs_ugi;
cmd += " -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000";
paddle::framework::hdfs_set_command(cmd); paddle::framework::hdfs_set_command(cmd);
} }
......
...@@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context ...@@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
...@@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu ...@@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
cc_test(exception_holder_test SRCS exception_holder_test.cc ) cc_test(exception_holder_test SRCS exception_holder_test.cc )
......
...@@ -18,7 +18,8 @@ ...@@ -18,7 +18,8 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( ...@@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
} }
// Wait FetchOps. // Wait FetchOps.
ClearFetchOp(graph_, &fetch_ops); ClearFetchOp(graph_, &fetch_ops);
for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
}
return fetches; return fetches;
} }
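The extra wait loop is required because FetchAsyncOpHandle issues its device-to-pinned-host copies asynchronously on the per-place fetch contexts, so the executor must synchronize those contexts before the fetched tensors are handed back to the caller. A minimal sketch of the underlying pattern, with hypothetical buffer and stream names:

// Sketch only: the result of an async copy must not be read until its stream is synchronized.
cudaMemcpyAsync(dst_pinned, src_dev, nbytes, cudaMemcpyDeviceToHost, stream);
// ... enqueue the remaining fetch copies on their own streams ...
cudaStreamSynchronize(stream);  // analogous to fetch_ctxs_.Get(place)->Wait()
// only now is dst_pinned safe to read on the host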
...@@ -162,7 +168,7 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -162,7 +168,7 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
ir::Node *fetch_node = ir::Node *fetch_node =
graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_, auto *op = new FetchAsyncOpHandle(fetch_node, fetches, i, &local_scopes_,
&local_exec_scopes_, return_merged); &local_exec_scopes_, return_merged);
fetch_ops->emplace_back(op); fetch_ops->emplace_back(op);
...@@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
op->AddInput(var); op->AddInput(var);
} }
for (auto *var : vars) {
auto *op = var->GeneratedOp();
auto *compute_op = dynamic_cast<details::ComputationOpHandle *>(op);
if (compute_op) {
compute_op->SetLockAndRecordEventFree(false);
}
}
int dep = static_cast<int>(op->NotReadyInputSize()); int dep = static_cast<int>(op->NotReadyInputSize());
(*op_deps)[op] = dep; (*op_deps)[op] = dep;
if (dep == 0) { if (dep == 0) {
...@@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { ...@@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchOpHandle *>(op)) { if (strategy_.num_threads_ == 1 && !dynamic_cast<FetchAsyncOpHandle *>(op)) {
traced_ops_.emplace_back(op); traced_ops_.emplace_back(op);
} }
} }
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node, FetchResultType *data,
size_t offset,
std::vector<Scope *> *local_scopes,
std::vector<Scope *> *local_exec_scopes,
bool return_merged)
: OpHandleBase(node),
data_(data),
offset_(offset),
local_scopes_(local_scopes),
local_exec_scopes_(local_exec_scopes),
return_merged_(return_merged) {}
FetchAsyncOpHandle::~FetchAsyncOpHandle() {}
void FetchAsyncOpHandle::RecordWaitEventOnCtx(
platform::DeviceContext *waited_ctx) {
PADDLE_THROW(platform::errors::PermissionDenied(
"No nodes need to wait FetchAsyncOp. Unexpceted Error."));
}
static void CheckTensorAttrs(const LoDTensor *tensor,
const proto::VarType::Type &type,
const DataLayout &layout, const DDim &dims,
const LoD &lod, const size_t offset) {
if (tensor->numel() && tensor->IsInitialized()) {
// step1: check type
PADDLE_ENFORCE_EQ(
type, tensor->type(),
platform::errors::InvalidArgument(
"The data type of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
DataTypeToString(type), DataTypeToString(tensor->type()), offset));
// step2: check layout
PADDLE_ENFORCE_EQ(
layout, tensor->layout(),
platform::errors::InvalidArgument(
"The layout of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
DataLayoutToString(layout), DataLayoutToString(tensor->layout()),
offset));
}
// step3: check dims
auto tensor_dims = tensor->dims();
PADDLE_ENFORCE_EQ(dims.size(), tensor_dims.size(),
platform::errors::InvalidArgument(
"The dimension sizes of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
dims, tensor_dims, offset));
for (int j = 1; j < dims.size(); j++) {
PADDLE_ENFORCE_EQ(dims[j], tensor_dims[j],
platform::errors::InvalidArgument(
"The dimensions of fetched Tensors or "
"the items of fetched LoDTensorArray are "
"different from each other on different "
"devices(%s vs %s). And the error is caused by the "
"%zu (th) fetched variable. Please set the "
"parameter `return_merged = False` when "
"you call the `Executor.run()` method.",
dims, tensor_dims, offset));
}
// step4: check lod
PADDLE_ENFORCE_EQ(
lod.size(), tensor->lod().size(),
platform::errors::InvalidArgument(
"The LoD information of fetched Tensors or the items of fetched "
"LoDTensorArray are different from each other on different "
"devices(%s vs %s). And the error is caused by the %zu "
"(th) fetched variable. Please set the "
"parameter `return_merged = False` when you "
"call the `Executor.run()` method.",
lod, tensor->lod(), offset));
}
static void TransData(const framework::Tensor *src_item,
framework::Tensor *dst_item,
const platform::DeviceContext &ctx) {
if (src_item->IsInitialized() && src_item->numel() > 0) {
if (platform::is_gpu_place(src_item->place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
#endif
} else {
TensorCopy(*src_item, platform::CPUPlace(), dst_item);
}
}
}
void FetchAsyncOpHandle::FetchMergedLodTensor(
const std::vector<const LoDTensor *> &src_lodtensors,
LoDTensor *dst_lodtensor) {
// calc dst type, layout, dim and lod, and the dim used for consistency checks
proto::VarType::Type new_type = proto::VarType::FP32;
framework::DataLayout new_layout;
framework::DDim new_dim;
LoD new_lod = src_lodtensors[0]->lod();
framework::DDim check_dim;
for (auto *t : src_lodtensors) {
if (t->numel() && t->IsInitialized()) {
check_dim = t->dims();
new_type = t->type();
new_layout = t->layout();
break;
}
}
bool find_first_dims = false;
for (auto *t : src_lodtensors) {
if (t->numel() && t->IsInitialized()) {
if (!find_first_dims) {
new_dim = t->dims();
find_first_dims = true;
} else {
new_dim[0] += t->dims()[0];
}
}
}
// check src type, layout, dim and lod consistency
for (size_t i = 1; i < src_lodtensors.size(); ++i) {
CheckTensorAttrs(src_lodtensors[i], new_type, new_layout, check_dim,
new_lod, offset_);
}
// set dst tensor
dst_lodtensor->Resize(new_dim);
dst_lodtensor->set_layout(src_lodtensors[0]->layout());
dst_lodtensor->set_lod(src_lodtensors[0]->lod());
if (platform::is_gpu_place(src_lodtensors[0]->place())) {
dst_lodtensor->mutable_data(platform::CUDAPinnedPlace(),
src_lodtensors[0]->type());
} else {
dst_lodtensor->mutable_data(platform::CPUPlace(),
src_lodtensors[0]->type());
}
// slice and memcpy
int begin = 0;
for (auto *src : src_lodtensors) {
int end = begin + src->dims()[0];
if (end == begin) {
continue;
}
auto dst = dst_lodtensor->Slice(begin, end);
TransData(src, &dst, *dev_ctxes_[src->place()]);
begin = end;
}
}
void FetchAsyncOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
// get src vars
auto &scopes = *local_exec_scopes_;
std::vector<Variable *> src_vars;
src_vars.reserve(inputs_.size());
for (size_t i = 0; i < inputs_.size(); ++i) {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx());
auto *var = scope->FindVar(var_handle->name());
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound(
"Cannot find variable %s in execution scope.", var_handle->name()));
src_vars.emplace_back(var);
}
if (return_merged_) {
auto &val = BOOST_GET(FetchList, *data_);
if (src_vars[0]->IsType<LoDTensor>()) {
// to lodtensor type
std::vector<const LoDTensor *> src_lodtensors;
src_lodtensors.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
src_lodtensors.emplace_back(&src_vars[i]->Get<framework::LoDTensor>());
}
LoDTensor dst_lodtensor;
FetchMergedLodTensor(src_lodtensors, &dst_lodtensor);
val.at(offset_) = std::move(dst_lodtensor);
} else {
// to lodtensorarray type
std::vector<const LoDTensorArray *> src_lodtensor_arrays;
src_lodtensor_arrays.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
src_lodtensor_arrays.emplace_back(
&src_vars[i]->Get<framework::LoDTensorArray>());
}
LoDTensorArray dst_lodtensor_array;
dst_lodtensor_array.resize(src_lodtensor_arrays[0]->size());
for (size_t i = 0; i < dst_lodtensor_array.size(); ++i) {
std::vector<const LoDTensor *> src_lodtensors;
src_lodtensors.reserve(src_lodtensor_arrays.size());
for (size_t j = 0; j < src_lodtensor_arrays.size(); ++j) {
src_lodtensors.emplace_back(&(*src_lodtensor_arrays[j])[i]);
}
FetchMergedLodTensor(src_lodtensors, &dst_lodtensor_array[i]);
}
val.at(offset_) = std::move(dst_lodtensor_array);
}
} else {
auto &val = BOOST_GET(FetchUnmergedList, *data_);
auto &dst_tensors = val.at(offset_);
dst_tensors.reserve(src_vars.size());
for (size_t i = 0; i < src_vars.size(); ++i) {
if (src_vars[i]->IsType<LoDTensor>()) {
auto &t = src_vars[i]->Get<framework::LoDTensor>();
LoDTensor item;
TransData(&t, &item, *dev_ctxes_[t.place()]);
dst_tensors.emplace_back(std::move(item));
} else {
auto &t = src_vars[i]->Get<framework::LoDTensorArray>();
LoDTensorArray item;
item.resize(t.size());
for (size_t j = 0; j < t.size(); ++j) {
TransData(&t[j], &item[j], *dev_ctxes_[t[j].place()]);
}
dst_tensors.emplace_back(std::move(item));
}
}
}
}
bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; }
std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; }
} // namespace details
} // namespace framework
} // namespace paddle
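FetchMergedLodTensor concatenates the per-device copies of one fetched variable along dimension 0: the destination's first dimension is the sum of the sources' first dimensions, and each source is copied into the matching slice. A minimal standalone sketch of the same begin/end slice bookkeeping, using plain row-major float blocks rather than LoDTensors:

#include <cstring>
#include <vector>

// Concatenate per-device row-major blocks (equal trailing dims) along dim 0.
std::vector<float> MergeAlongDim0(const std::vector<std::vector<float>>& srcs) {
  size_t total = 0;
  for (const auto& s : srcs) total += s.size();
  std::vector<float> dst(total);
  size_t begin = 0;  // mirrors the begin/end slicing in FetchMergedLodTensor
  for (const auto& s : srcs) {
    if (s.empty()) continue;  // skip empty/uninitialized per-device results
    std::memcpy(dst.data() + begin, s.data(), s.size() * sizeof(float));
    begin += s.size();
  }
  return dst;
}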
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FetchAsyncOpHandle : public OpHandleBase {
public:
FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, size_t offset,
std::vector<Scope *> *local_scopes,
std::vector<Scope *> *local_exec_scopes,
bool return_merged);
~FetchAsyncOpHandle();
void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
std::string Name() const override;
bool IsMultiDeviceTransfer() override;
protected:
void RunImpl() override;
std::vector<Scope *> GetLocalScopes() override { return *local_scopes_; }
void FetchMergedLodTensor(
const std::vector<const LoDTensor *> &src_lodtensors,
LoDTensor *dst_lodtensor);
private:
FetchResultType *data_;
size_t offset_;
std::vector<Scope *> *local_scopes_;
std::vector<Scope *> *local_exec_scopes_;
bool return_merged_;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data, ...@@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data,
FetchOpHandle::~FetchOpHandle() {} FetchOpHandle::~FetchOpHandle() {}
void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); PADDLE_THROW(platform::errors::PermissionDenied(
"No nodes need to wait FetchOp. Unexpceted Error."));
} }
static void CheckDims(const framework::DDim &tensor_dims, static void CheckDims(const framework::DDim &tensor_dims,
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) { ...@@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
if (fetch_ops->empty()) return; if (fetch_ops->empty()) return;
for (auto& op : *fetch_ops) { for (auto& op : *fetch_ops) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
dynamic_cast<FetchOpHandle*>(op), dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
"The input ops of ClearFetchOp function should be FetchOpHandle."); true,
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle.");
for (auto& out_var : op->Node()->outputs) { for (auto& out_var : op->Node()->outputs) {
graph->RemoveNode(out_var); graph->RemoveNode(out_var);
} }
......
...@@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( ...@@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
float* g = g_tensor->data<float>(); float* g = g_tensor->data<float>();
if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
int dim = emb_dim + offset; int dim = emb_dim;
Eigen::Map< Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / dim, dim); g_mat(g, g_tensor->numel() / dim, dim);
...@@ -1170,6 +1170,21 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id, ...@@ -1170,6 +1170,21 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
#endif #endif
} }
void FleetWrapper::LoadWithWhitelist(const uint64_t table_id,
const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path,
std::to_string(mode));
ret.wait();
if (ret.get() != 0) {
LOG(ERROR) << "load model of table id: " << table_id
<< ", from path: " << path << " failed";
}
#else
VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib";
#endif
}
void FleetWrapper::SaveModel(const std::string& path, const int mode) { void FleetWrapper::SaveModel(const std::string& path, const int mode) {
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode)); auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode));
...@@ -1285,6 +1300,26 @@ int32_t FleetWrapper::SaveCache(int table_id, const std::string& path, ...@@ -1285,6 +1300,26 @@ int32_t FleetWrapper::SaveCache(int table_id, const std::string& path,
#endif #endif
} }
int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path,
const int mode,
const std::string& whitelist_path) {
#ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist(
table_id, path, std::to_string(mode), whitelist_path);
ret.wait();
int32_t feasign_cnt = ret.get();
if (feasign_cnt == -1) {
LOG(ERROR) << "table save cache failed";
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
return feasign_cnt;
#else
VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
return -1;
#endif
}
void FleetWrapper::ShrinkSparseTable(int table_id) { void FleetWrapper::ShrinkSparseTable(int table_id) {
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); auto ret = pslib_ptr_->_worker_ptr->shrink(table_id);
......
...@@ -273,6 +273,11 @@ class FleetWrapper { ...@@ -273,6 +273,11 @@ class FleetWrapper {
// save cache model // save cache model
// cache model can speed up online predict // cache model can speed up online predict
int32_t SaveCache(int table_id, const std::string& path, const int mode); int32_t SaveCache(int table_id, const std::string& path, const int mode);
// save sparse table filtered by user-defined whitelist
int32_t SaveWithWhitelist(int table_id, const std::string& path,
const int mode, const std::string& whitelist_path);
void LoadWithWhitelist(const uint64_t table_id, const std::string& path,
const int mode);
// copy feasign key/value from src_table_id to dest_table_id // copy feasign key/value from src_table_id to dest_table_id
int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id); int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
// copy feasign key/value from src_table_id to dest_table_id // copy feasign key/value from src_table_id to dest_table_id
......
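A hedged usage sketch of the new whitelist entry points, based only on the declarations above; the table id, mode and paths are placeholders, and the usual FleetWrapper::GetInstance() accessor is assumed:

// Save a sparse table keeping only the feasigns listed in the whitelist file,
// then load it back (values below are hypothetical).
auto fleet = paddle::framework::FleetWrapper::GetInstance();
int32_t feasign_cnt = fleet->SaveWithWhitelist(
    /*table_id=*/0, "/path/to/table_out", /*mode=*/0, "/path/to/whitelist.txt");
fleet->LoadWithWhitelist(/*table_id=*/0, "/path/to/table_out", /*mode=*/0);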
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
DECLARE_bool(use_mkldnn);
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
...@@ -47,6 +49,9 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, ...@@ -47,6 +49,9 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs, const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward) { const platform::Place& place, bool trace_backward) {
VLOG(1) << "Trace Op: " << type; VLOG(1) << "Trace Op: " << type;
if (FLAGS_use_mkldnn) {
attrs["use_mkldnn"] = true;
}
auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
const auto& op_info = op->Info(); const auto& op_info = op->Info();
auto* attr_checker = op_info.Checker(); auto* attr_checker = op_info.Checker();
......
...@@ -217,17 +217,6 @@ void AnalysisConfig::EnableMkldnnQuantizer() { ...@@ -217,17 +217,6 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
Update(); Update();
} }
void AnalysisConfig::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16";
use_mkldnn_bfloat16_ = false;
#endif
Update();
}
MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
"MkldnnQuantizer was not enabled yet."); "MkldnnQuantizer was not enabled yet.");
...@@ -341,12 +330,6 @@ void AnalysisConfig::Update() { ...@@ -341,12 +330,6 @@ void AnalysisConfig::Update() {
#endif #endif
} }
if (use_mkldnn_bfloat16_) {
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMkldnnBfloat16();
#endif
}
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Do not optimize when mkldnn is on // Do not optimize when mkldnn is on
if (enable_memory_optim_ && !use_mkldnn_) { if (enable_memory_optim_ && !use_mkldnn_) {
...@@ -415,7 +398,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -415,7 +398,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << ";"; ss << ";";
ss << use_mkldnn_quantizer_; ss << use_mkldnn_quantizer_;
ss << use_mkldnn_bfloat16_;
ss << model_from_memory_; ss << model_from_memory_;
ss << with_profile_; ss << with_profile_;
......
...@@ -485,25 +485,4 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { ...@@ -485,25 +485,4 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA
TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
config.EnableUseGpu(100, 0);
config.EnableMkldnnBfloat16();
#ifdef PADDLE_WITH_MKLDNN
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
#else
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
#endif
}
#endif
TEST(AnalysisPredictor, bf16_pass_strategy) {
std::vector<std::string> passes;
PassStrategy passStrategy(passes);
passStrategy.EnableMkldnnBfloat16();
}
} // namespace paddle } // namespace paddle
...@@ -401,19 +401,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -401,19 +401,6 @@ struct PD_INFER_DECL AnalysisConfig {
/// ///
void EnableMkldnnQuantizer(); void EnableMkldnnQuantizer();
///
/// \brief Turn on MKLDNN bfloat16.
///
///
void EnableMkldnnBfloat16();
///
/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
///
/// \return bool Whether to use the MKLDNN Bfloat16.
///
bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; }
/// ///
/// \brief A boolean state telling whether the thread local CUDA stream is /// \brief A boolean state telling whether the thread local CUDA stream is
/// enabled. /// enabled.
...@@ -605,7 +592,6 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -605,7 +592,6 @@ struct PD_INFER_DECL AnalysisConfig {
int mkldnn_cache_capacity_{0}; int mkldnn_cache_capacity_{0};
bool use_mkldnn_quantizer_{false}; bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_; std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
bool use_mkldnn_bfloat16_{false};
// If the config is already used on a predictor, it becomes invalid. // If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor. // Any config can only be used with one predictor.
......
...@@ -143,10 +143,6 @@ void GpuPassStrategy::EnableMkldnnQuantizer() { ...@@ -143,10 +143,6 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
LOG(ERROR) << "GPU not support MKL-DNN quantization"; LOG(ERROR) << "GPU not support MKL-DNN quantization";
} }
void GpuPassStrategy::EnableMkldnnBfloat16() {
LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will // NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones. // not be damaged by smaller ones.
...@@ -229,12 +225,4 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { ...@@ -229,12 +225,4 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
#endif #endif
} }
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
use_mkldnn_bfloat16_ = true;
#else
use_mkldnn_bfloat16_ = false;
#endif
}
} // namespace paddle } // namespace paddle
...@@ -132,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { ...@@ -132,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable MKLDNN quantize optimization. /// \brief Enable MKLDNN quantize optimization.
virtual void EnableMkldnnQuantizer() {} virtual void EnableMkldnnQuantizer() {}
/// \brief Enable MKLDNN bfloat16.
virtual void EnableMkldnnBfloat16() {}
/// \brief Check if we are using gpu. /// \brief Check if we are using gpu.
/// \return A bool variable implying whether we are in gpu mode. /// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
...@@ -164,7 +161,6 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { ...@@ -164,7 +161,6 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
use_gpu_ = other.use_gpu_; use_gpu_ = other.use_gpu_;
use_mkldnn_ = other.use_mkldnn_; use_mkldnn_ = other.use_mkldnn_;
use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_;
} }
/// \brief Default destructor. /// \brief Default destructor.
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
...@@ -178,13 +174,9 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { ...@@ -178,13 +174,9 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
/// \brief Enable MKLDNN quantize optimization. /// \brief Enable MKLDNN quantize optimization.
void EnableMkldnnQuantizer() override; void EnableMkldnnQuantizer() override;
/// \brief Enable MKLDNN bfloat16.
void EnableMkldnnBfloat16() override;
protected: protected:
/// \cond Protected /// \cond Protected
bool use_mkldnn_quantizer_{false}; bool use_mkldnn_quantizer_{false};
bool use_mkldnn_bfloat16_{false};
/// \endcond /// \endcond
}; };
...@@ -213,9 +205,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { ...@@ -213,9 +205,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Not supported in GPU mode yet. /// \brief Not supported in GPU mode yet.
void EnableMkldnnQuantizer() override; void EnableMkldnnQuantizer() override;
/// \brief Not supported in GPU mode yet.
void EnableMkldnnBfloat16() override;
/// \brief Default destructor. /// \brief Default destructor.
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
......
...@@ -235,12 +235,6 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer( ...@@ -235,12 +235,6 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer(
PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled( PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled(
const PD_AnalysisConfig* config); const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16(
PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled(
const PD_AnalysisConfig* config);
PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config, PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config,
const char* prog_buffer, const char* prog_buffer,
size_t prog_buffer_size, size_t prog_buffer_size,
......
...@@ -207,18 +207,6 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { ...@@ -207,18 +207,6 @@ bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
return config->config.mkldnn_quantizer_enabled(); return config->config.mkldnn_quantizer_enabled();
} }
void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
config->config.EnableMkldnnBfloat16();
}
bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
"PD_AnalysisConfig should not be null"));
return config->config.mkldnn_bfloat16_enabled();
}
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer, size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) { size_t params_buffer_size) {
......
...@@ -187,6 +187,14 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -187,6 +187,14 @@ void TensorRTEngine::FreezeNetwork() {
Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
} }
infer_builder_config_->addOptimizationProfile(optim_profile_); infer_builder_config_->addOptimizationProfile(optim_profile_);
infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
if (enable_int8) {
// Due to a TRT bug, the precision BuilderFlag kFP16 must be set before
// kINT8 here to perform INT8 inference.
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
}
if (WithFp16()) { if (WithFp16()) {
infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
if (disable_trt_plugin_fp16()) { if (disable_trt_plugin_fp16()) {
......
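For reference, the same INT8 setup expressed directly against the TensorRT 6/7 builder-config API looks roughly like the following; this is a sketch assuming a builder, network and calibrator already exist, not the engine's actual code path:

nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
config->setMaxWorkspaceSize(1 << 30);
// Work around the TRT issue noted above: request FP16 before INT8,
// and pin layer precisions with kSTRICT_TYPES.
config->setFlag(nvinfer1::BuilderFlag::kFP16);
config->setFlag(nvinfer1::BuilderFlag::kINT8);
config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES);
config->setInt8Calibrator(calibrator);  // or set per-tensor dynamic ranges instead
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);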
...@@ -45,12 +45,13 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -45,12 +45,13 @@ struct SimpleOpTypeSetTeller : public Teller {
private: private:
// use this set for no calib int8. // use this set for no calib int8.
std::unordered_set<std::string> int8_teller_set{"matmul", std::unordered_set<std::string> int8_teller_set{"mul",
"conv2d", "conv2d",
"pool2d", "pool2d",
"relu", "relu",
"depthwise_conv2d", "depthwise_conv2d",
"softmax", "softmax",
"sigmoid",
"batch_norm", "batch_norm",
"elementwise_add", "elementwise_add",
"leaky_relu", "leaky_relu",
......
...@@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( ...@@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions(
auto stri_0 = expr_builder.constant(strides_[0]); auto stri_0 = expr_builder.constant(strides_[0]);
auto stri_1 = expr_builder.constant(strides_[1]); auto stri_1 = expr_builder.constant(strides_[1]);
auto one_value = expr_builder.constant(1);
auto tmp1_0 = auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]);
expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1); auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]);
auto tmp1_1 =
expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1);
auto tmp2_0 = expr_builder.constant( auto ceil_tmp =
(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1); expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1);
auto tmp2_1 = expr_builder.constant( auto ceil1_tmp =
(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1); expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1);
auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[2], *stri_0);
auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[3], *stri_1);
if (!ceil_mode_) { if (!ceil_mode_) {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, output.d[2] = expr_builder.operation(
*a_d, *tmp1_0); nvinfer1::DimensionOperation::kSUM,
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, *expr_builder.operation(
*b_d, *tmp1_1); nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[2], *v0_tmp),
*stri_0),
*one_value);
output.d[3] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[3], *v1_tmp),
*stri_1),
*one_value);
} else { } else {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, output.d[2] = expr_builder.operation(
*a_d, *tmp2_0); nvinfer1::DimensionOperation::kSUM,
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, *expr_builder.operation(
*b_d, *tmp2_1); nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[2], *ceil_tmp),
*stri_0),
*one_value);
output.d[3] = expr_builder.operation(
nvinfer1::DimensionOperation::kSUM,
*expr_builder.operation(
nvinfer1::DimensionOperation::kFLOOR_DIV,
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*inputs[0].d[3], *ceil1_tmp),
*stri_1),
*one_value);
} }
return output; return output;
......
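Evaluated on concrete numbers, the rewritten dimension expressions reduce to the standard pooling output-size formula, with floor division in the default mode and ceil division when ceil_mode is set. A minimal standalone sketch (not TRT expression-builder code):

// out = (in + 2 * pad - ksize) / stride + 1, using floor or ceil division.
int PoolOutSize(int in, int ksize, int pad, int stride, bool ceil_mode) {
  int numer = in + 2 * pad - ksize;
  if (ceil_mode) numer += stride - 1;  // matches the "+ strides - 1" constant above
  return numer / stride + 1;           // integer (floor) division
}
// e.g. PoolOutSize(6, 3, 0, 2, false) == 2, while PoolOutSize(6, 3, 0, 2, true) == 3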
...@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { ...@@ -54,7 +54,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
auto ptr = new SkipLayerNormPluginDynamic( auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_; ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = bias_gpu_; ptr->scale_gpu_ = scale_gpu_;
return ptr; return ptr;
} }
......
...@@ -24,6 +24,39 @@ namespace tensorrt { ...@@ -24,6 +24,39 @@ namespace tensorrt {
namespace plugin { namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
StackPluginDynamic::StackPluginDynamic(int axis, int num_stack)
: axis_(axis), num_stack_(num_stack) {}
StackPluginDynamic::StackPluginDynamic(void const* serial_data,
size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &num_stack_);
}
StackPluginDynamic::~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const {
return new StackPluginDynamic(axis_, num_stack_);
}
const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; }
int StackPluginDynamic::getNbOutputs() const { return 1; }
int StackPluginDynamic::initialize() { return 0; }
size_t StackPluginDynamic::getSerializationSize() const {
size_t serialize_size = 0;
serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_);
return serialize_size;
}
void StackPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_);
}
nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) { nvinfer1::IExprBuilder& expr_builder) {
...@@ -37,6 +70,20 @@ nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( ...@@ -37,6 +70,20 @@ nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
return output; return output;
} }
void StackPluginDynamic::configurePlugin(
const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
size_t StackPluginDynamic::getWorkspaceSize(
const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
return num_stack_ * sizeof(uintptr_t);
}
void StackPluginDynamic::destroy() { delete this; }
void StackPluginDynamic::terminate() {}
bool StackPluginDynamic::supportsFormatCombination( bool StackPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
int nb_outputs) { int nb_outputs) {
...@@ -109,8 +156,11 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -109,8 +156,11 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
lead_unit *= out_dims.d[i]; lead_unit *= out_dims.d[i];
} }
cudaMemcpyAsync(reinterpret_cast<void*>(in_ptr_gpu_), PADDLE_ENFORCE_EQ(
reinterpret_cast<const void* const>(inputs), out_dims.d[axis_], num_stack_,
platform::errors::InvalidArgument("number of stack axis should be same"));
cudaMemcpyAsync(workspace, reinterpret_cast<const void* const>(inputs),
sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice, sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice,
stream); stream);
...@@ -122,13 +172,13 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -122,13 +172,13 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
if (infer_type == nvinfer1::DataType::kFLOAT) { if (infer_type == nvinfer1::DataType::kFLOAT) {
float* output = static_cast<float*>(outputs[0]); float* output = static_cast<float*>(outputs[0]);
StackKernel<float><<<num_blocks, num_threads, 0, stream>>>( StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const float* const*>(in_ptr_gpu_), output, num_stacks, reinterpret_cast<const float* const*>(workspace), output, num_stacks,
base_unit); base_unit);
} else if (infer_type == nvinfer1::DataType::kHALF) { } else if (infer_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16 #ifdef SUPPORTS_CUDA_FP16
__half* output = static_cast<__half*>(outputs[0]); __half* output = static_cast<__half*>(outputs[0]);
StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>( StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const __half* const*>(in_ptr_gpu_), output, num_stacks, reinterpret_cast<const __half* const*>(workspace), output, num_stacks,
base_unit); base_unit);
#else #else
PADDLE_THROW(platform::errors::Fatal( PADDLE_THROW(platform::errors::Fatal(
...@@ -141,6 +191,54 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -141,6 +191,54 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
StackPluginDynamicCreator::StackPluginDynamicCreator() {}
const char* StackPluginDynamicCreator::getPluginName() const {
return "stack_plugin";
}
const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; }
const nvinfer1::PluginFieldCollection*
StackPluginDynamicCreator::getFieldNames() {
return &field_collection_;
}
nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) {
int axis = -1;
int num_stack = -1;
for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name);
if (name == "axis") {
axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else {
PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" +
name +
"' when creating stack op plugin."));
}
}
return new StackPluginDynamic(axis, num_stack);
}
nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin(
const char* name, const void* serial_data, size_t serial_length) {
auto plugin = new StackPluginDynamic(serial_data, serial_length);
return plugin;
}
void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) {
plugin_namespace_ = lib_namespace;
}
const char* StackPluginDynamicCreator::getPluginNamespace() const {
return plugin_namespace_.c_str();
}
#endif #endif
} // namespace plugin } // namespace plugin
......
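The key change above replaces the old member GPU buffer (in_ptr_gpu_) with the TensorRT-provided workspace: getWorkspaceSize reserves num_stack_ * sizeof(uintptr_t) bytes, and enqueue copies the host array of input pointers into that workspace before launching StackKernel. A minimal sketch of the pattern, with hypothetical names:

// inputs: host array of num_inputs device pointers;
// workspace: at least num_inputs * sizeof(void*) bytes of device memory.
cudaMemcpyAsync(workspace, inputs, num_inputs * sizeof(void*),
                cudaMemcpyHostToDevice, stream);
// The kernel then reads the pointer table directly from device memory, e.g.
// StackKernel<<<blocks, threads, 0, stream>>>(
//     reinterpret_cast<const float* const*>(workspace), output, ...);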
...@@ -28,68 +28,24 @@ namespace plugin { ...@@ -28,68 +28,24 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class StackPluginDynamic : public DynamicPluginTensorRT { class StackPluginDynamic : public DynamicPluginTensorRT {
public: public:
StackPluginDynamic(int axis, int num_stack) explicit StackPluginDynamic(int axis, int num_stack);
: axis_(axis), num_stack_(num_stack) { StackPluginDynamic(void const* serial_data, size_t serial_length);
init(); ~StackPluginDynamic();
} nvinfer1::IPluginV2DynamicExt* clone() const override;
StackPluginDynamic(void const* serialData, size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &axis_);
DeserializeValue(&serialData, &serialLength, &num_stack_);
init();
}
~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new StackPluginDynamic(axis_, num_stack_);
}
void init() {
int device_id;
cudaGetDevice(&device_id);
in_ptr_tensor_.Resize({num_stack_});
in_ptr_gpu_ =
in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
}
const char* getPluginType() const override { return "stack_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override {
size_t serialize_size = 0;
serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_);
return serialize_size;
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_);
}
nvinfer1::DimsExprs getOutputDimensions( nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) override; nvinfer1::IExprBuilder& exprBuilder) override;
bool supportsFormatCombination(int pos, bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut, const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override; int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out, const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {} int nbOutputs) override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs, const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override { int nbOutputs) const override;
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace, const void* const* inputs, void* const* outputs, void* workspace,
...@@ -99,68 +55,39 @@ class StackPluginDynamic : public DynamicPluginTensorRT { ...@@ -99,68 +55,39 @@ class StackPluginDynamic : public DynamicPluginTensorRT {
const nvinfer1::DataType* inputTypes, const nvinfer1::DataType* inputTypes,
int nbInputs) const override; int nbInputs) const override;
void destroy() override { delete this; } const char* getPluginType() const override;
int getNbOutputs() const override;
int initialize() override;
void terminate() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
void destroy() override;
private: private:
int axis_; int axis_;
int num_stack_; int num_stack_;
framework::Tensor in_ptr_tensor_;
int64_t* in_ptr_gpu_;
}; };
class StackPluginV2Creator : public nvinfer1::IPluginCreator { class StackPluginDynamicCreator : public nvinfer1::IPluginCreator {
public: public:
StackPluginV2Creator() {} StackPluginDynamicCreator();
const char* getPluginName() const override { return "stack_plugin"; } const char* getPluginName() const override;
const char* getPluginVersion() const override;
const char* getPluginVersion() const override { return "1"; } const nvinfer1::PluginFieldCollection* getFieldNames() override;
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin( nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override { const char* name, const nvinfer1::PluginFieldCollection* fc) override;
int axis = -1;
int num_stack = -1;
for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name);
if (name == "axis") {
axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else {
PADDLE_THROW(
platform::errors::Fatal("Meet an unknown plugin field '" + name +
"' when creating stack op plugin."));
}
}
return new StackPluginDynamic(axis, num_stack);
}
nvinfer1::IPluginV2* deserializePlugin(const char* name, nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data, const void* serial_data,
size_t serial_length) override { size_t serial_length) override;
auto plugin = new StackPluginDynamic(serial_data, serial_length); void setPluginNamespace(const char* lib_namespace) override;
return plugin; const char* getPluginNamespace() const override;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private: private:
std::string plugin_namespace_; std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_; std::vector<nvinfer1::PluginField> plugin_attributes_;
}; };
REGISTER_TRT_PLUGIN_V2(StackPluginV2Creator); REGISTER_TRT_PLUGIN_V2(StackPluginDynamicCreator);
#endif #endif
} // namespace plugin } // namespace plugin
......
...@@ -431,9 +431,9 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -431,9 +431,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model") set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}) if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz") inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
endif() endif()
inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
......
...@@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) { ...@@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) {
PD_SwitchIrOptim(config, true); PD_SwitchIrOptim(config, true);
bool ir_optim = PD_IrOptim(config); bool ir_optim = PD_IrOptim(config);
CHECK(ir_optim) << "NO"; CHECK(ir_optim) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(!bfloat16_enable) << "NO";
PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false, PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false,
false); false);
bool trt_enable = PD_TensorrtEngineEnabled(config); bool trt_enable = PD_TensorrtEngineEnabled(config);
......
...@@ -88,9 +88,6 @@ TEST(PD_AnalysisConfig, profile_mkldnn) { ...@@ -88,9 +88,6 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
PD_EnableMkldnnQuantizer(config); PD_EnableMkldnnQuantizer(config);
bool quantizer_enable = PD_MkldnnQuantizerEnabled(config); bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
CHECK(quantizer_enable) << "NO"; CHECK(quantizer_enable) << "NO";
PD_EnableMkldnnBfloat16(config);
bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
CHECK(bfloat16_enable) << "NO";
PD_SetMkldnnCacheCapacity(config, 0); PD_SetMkldnnCacheCapacity(config, 0);
PD_SetModel(config, prog_file.c_str(), params_file.c_str()); PD_SetModel(config, prog_file.c_str(), params_file.c_str());
PD_DeleteAnalysisConfig(config); PD_DeleteAnalysisConfig(config);
......
...@@ -25,12 +25,20 @@ namespace inference { ...@@ -25,12 +25,20 @@ namespace inference {
TEST(quant_int8, resnet50) { TEST(quant_int8, resnet50) {
std::string model_dir = FLAGS_infer_model; std::string model_dir = FLAGS_infer_model;
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(1000, 0);
config.SetModel(model_dir); config.SetModel(model_dir);
config.SwitchUseFeedFetchOps(false); config.SwitchUseFeedFetchOps(false);
config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8, config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8,
false, false); false, false);
std::map<std::string, std::vector<int>> min_input_shape = {
{"image", {1, 1, 3, 3}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"image", {1, 1, 10, 10}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"image", {1, 1, 3, 3}}};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames(); auto input_names = predictor->GetInputNames();
int channels = 1; int channels = 1;
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
...@@ -243,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC( ...@@ -243,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Operator. Computes cosine of x element-wise. Cosine Operator. Computes cosine of x element-wise.
Input range is `(-inf, inf)` and output range is `[-1,1]`. Input range is `(-inf, inf)` and output range is `[-1,1]`.
Return `nan` if input is out of boundary.
$$out = cos(x)$$ $$out = cos(x)$$
...@@ -341,7 +341,9 @@ $$out = \cos^{-1}(x)$$ ...@@ -341,7 +341,9 @@ $$out = \cos^{-1}(x)$$
class AsinOpMaker : public framework::OpProtoAndCheckerMaker { class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "Input of asin operator"); AddInput("X",
"Input of asin operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of asin operator"); AddOutput("Out", "Output of asin operator");
AddComment(R"DOC( AddComment(R"DOC(
Arcsine Operator. Arcsine Operator.
...@@ -355,7 +357,9 @@ $$out = \sin^{-1}(x)$$ ...@@ -355,7 +357,9 @@ $$out = \sin^{-1}(x)$$
class AtanOpMaker : public framework::OpProtoAndCheckerMaker { class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "Input of atan operator"); AddInput("X",
"Input of atan operator, an N-D Tensor, with data type float32, "
"float64 or float16.");
AddOutput("Out", "Output of atan operator"); AddOutput("Out", "Output of atan operator");
AddComment(R"DOC( AddComment(R"DOC(
Arctangent Operator. Arctangent Operator.
...@@ -1231,3 +1235,34 @@ REGISTER_OP_CPU_KERNEL( ...@@ -1231,3 +1235,34 @@ REGISTER_OP_CPU_KERNEL(
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::AbsGradFunctor<int64_t>>); ops::AbsGradFunctor<int64_t>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
R"ROC(fix leaky_relu, bahavior changed when alpha < 0 or alpha > 1)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"leaky_relu calculate formula before checkponit: out = max(x, "
"alpha * x); after checkpoint: out = x if x > 0 else alpha * "
"x"));
REGISTER_OP_VERSION(hard_shrink)
.AddCheckpoint(
R"ROC(fix hard_shrink, bahavior changed when threshold<0)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged(
"hard_shrink calculate formula before checkponit: out = x * "
"((x < -threshold) + (x > threshold)); after checkpoint: out = "
"x * (((x < -threshold) + (x > threshold)) > 0)"));
REGISTER_OP_VERSION(softplus)
.AddCheckpoint(
R"ROC(add new attributes [beta] and [threshold], and the formula is changed to "
" softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical"
" stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("beta", "The beta value of the new formula", 1.0f)
.NewAttr("threshold", "The threshold value of the new formula",
20.0f));
/* ========================================================================== */
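The three checkpoints above encode small but behavior-changing formula fixes. Below is a minimal, self-contained sketch of the post-checkpoint scalar formulas in plain C++ (not Paddle kernel code); `alpha`, `beta` and `threshold` mirror the attributes named above, and the softplus "linear function" reversion is taken here to be x, as in common softplus implementations.

#include <cmath>

// leaky_relu after the checkpoint: out = x if x > 0 else alpha * x
// (the old max(x, alpha * x) form differs when alpha < 0 or alpha > 1).
float LeakyRelu(float x, float alpha) { return x > 0.0f ? x : alpha * x; }

// hard_shrink after the checkpoint: keep x when x < -threshold or x > threshold,
// zero otherwise. The old sum-of-masks form produced 2 * x whenever
// threshold < 0 made both conditions true at once.
float HardShrink(float x, float threshold) {
  return (x < -threshold || x > threshold) ? x : 0.0f;
}

// softplus with the new beta/threshold attributes:
// softplus(x) = (1 / beta) * log(1 + exp(beta * x)),
// reverting to the linear function x when beta * x > threshold for stability.
float Softplus(float x, float beta, float threshold) {
  float bx = beta * x;
  return bx > threshold ? x : std::log1p(std::exp(bx)) / beta;
}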
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR( REGISTER_OPERATOR(
...@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( ...@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
int16_t>, int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
uint8_t>); uint8_t>);
REGISTER_OP_VERSION(arg_max)
.AddCheckpoint(
R"ROC(
Upgrade argmax, add a new attribute [flatten] and modify the attribute of dtype)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("flatten",
"In order to compute the argmax over the flattened array "
"when the "
"argument `axis` in python API is None.",
false)
.ModifyAttr(
"dtype",
"change the default value of dtype, the older version "
"is -1, means return the int64 indices."
"The new version is 3, return the int64 indices directly."
"And supporting the dtype of -1 in new version.",
3));
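The new `flatten` attribute changes which index space the result lives in, and the checkpoint's new default dtype (proto value 3) returns int64 indices. A small standalone illustration in plain C++ with hypothetical data (this is not the operator kernel):

#include <algorithm>
#include <cstdio>

int main() {
  // A 2 x 3 "tensor" stored row-major as a flat array.
  float x[6] = {0.1f, 5.0f, 0.3f, 7.0f, 0.2f, 0.4f};

  // flatten = true (axis is None in the Python API): a single index into the
  // flattened data; here the max 7.0 sits at flat position 3.
  int flat_idx = static_cast<int>(std::max_element(x, x + 6) - x);

  // flatten = false, axis = 1: one index per row; here {1, 0}.
  int row_idx[2];
  for (int r = 0; r < 2; ++r) {
    const float* row = x + r * 3;
    row_idx[r] = static_cast<int>(std::max_element(row, row + 3) - row);
  }

  std::printf("flattened argmax: %d, per-row argmax: {%d, %d}\n", flat_idx,
              row_idx[0], row_idx[1]);
  return 0;
}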
...@@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor { ...@@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor {
auto axis = ctx.Attr<int64_t>("axis"); auto axis = ctx.Attr<int64_t>("axis");
auto keepdims = ctx.Attr<bool>("keepdims"); auto keepdims = ctx.Attr<bool>("keepdims");
const bool& flatten = ctx.Attr<bool>("flatten"); const bool& flatten = ctx.Attr<bool>("flatten");
// paddle does not have a scalar tensor, so just return a shape [1] tensor
if (flatten) keepdims = true;
// if flatten, will construct the new dims for the calculation // if flatten, will construct the new dims for the calculation
framework::DDim x_dims; framework::DDim x_dims;
...@@ -164,15 +166,30 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { ...@@ -164,15 +166,30 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank;
if (ctx->IsRuntime()) {
const int& dtype = ctx->Attrs().Get<int>("dtype");
if (dtype == framework::proto::VarType::INT32) {
int64_t all_element_num = 0;
if (flatten) {
all_element_num = framework::product(x_dims);
} else {
all_element_num = x_dims[axis];
}
PADDLE_ENFORCE_LE(
all_element_num, INT_MAX,
"The element num of the argmin/argmax input at axis is "
"%d, is larger than int32 maximum value:%d, you must "
"set the dtype of argmin/argmax to 'int64'.",
all_element_num, INT_MAX);
}
}
std::vector<int64_t> vec; std::vector<int64_t> vec;
if (flatten) { if (flatten) {
// if flatten is set, only one element will be returned
if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1)); vec.emplace_back(static_cast<int64_t>(1));
}
} else { } else {
auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank;
for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
if (keepdims) { if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1)); vec.emplace_back(static_cast<int64_t>(1));
...@@ -194,10 +211,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -194,10 +211,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "Output tensor."); AddOutput("Out", "Output tensor.");
AddAttr<int64_t>("axis", "The axis in which to compute the arg indics."); AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false); AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
AddAttr<bool>("flatten", AddAttr<bool>("flatten",
"Flatten the input value, and search the min or max indices") "Flatten the input value, and search the min or max indices")
.SetDefault(false); .SetDefault(false);
AddAttr<int>("dtype",
"(int, 3), the dtype of indices, the indices dtype must be "
"int32, int64."
"default dtype is int64, and proto value is 3.")
.SetDefault(3);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
%s Operator. %s Operator.
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.h"
REGISTER_OPERATOR( REGISTER_OPERATOR(
...@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( ...@@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL(
int16_t>, int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
uint8_t>); uint8_t>);
REGISTER_OP_VERSION(arg_min)
.AddCheckpoint(
R"ROC(
Upgrade argmin, add a new attribute [flatten] and modify the attribute of dtype)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("flatten",
"In order to compute the argmin over the flattened array "
"when the "
"argument `axis` in python API is None.",
false)
.ModifyAttr(
"dtype",
"change the default value of dtype, the older version "
"is -1, means return the int64 indices."
"The new version is 3, return the int64 indices directly."
"And supporting the dtype of -1 in new version.",
3));
...@@ -31,6 +31,10 @@ struct BernoulliCudaFunctor { ...@@ -31,6 +31,10 @@ struct BernoulliCudaFunctor {
__host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {} __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
__host__ __device__ T operator()(const unsigned int n, const T p) const { __host__ __device__ T operator()(const unsigned int n, const T p) const {
// NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several
// lines of error messages if it fails, and it should be refined.
PADDLE_ENFORCE(p >= 0.0 && p <= 1.0,
"The probability should be >=0 and <= 1, but got %f", p);
thrust::minstd_rand rng; thrust::minstd_rand rng;
rng.seed(seed_); rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(0.0, 1.0); thrust::uniform_real_distribution<T> dist(0.0, 1.0);
......
...@@ -25,10 +25,12 @@ namespace operators { ...@@ -25,10 +25,12 @@ namespace operators {
template <typename T> template <typename T>
inline HOSTDEVICE T BernoulliFunctor(T p, T rand) { inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange( PADDLE_ENFORCE_LE(p, 1.0,
platform::errors::OutOfRange(
"The probability should be <= 1, but got %f", p)); "The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange( PADDLE_ENFORCE_GE(p, 0.0,
"The probability should be >= 1, but got %f", p)); platform::errors::OutOfRange(
"The probability should be >= 0, but got %f", p));
return static_cast<T>(rand < p); return static_cast<T>(rand < p);
} }
......
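Both the CUDA and the host-side checks above enforce the same contract: p must lie in [0, 1], and the sample is simply a uniform draw compared against p. A minimal host-only sketch of that contract in plain C++ (using std::mt19937 in place of the thrust engine used above; the helper name is hypothetical):

#include <cassert>
#include <random>

// Draw one Bernoulli(p) sample as "uniform < p", mirroring BernoulliFunctor.
int BernoulliSample(double p, std::mt19937* rng) {
  assert(p >= 0.0 && p <= 1.0);  // same range check as the kernels above
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  return dist(*rng) < p ? 1 : 0;
}

int main() {
  std::mt19937 rng(0);
  int ones = 0;
  for (int i = 0; i < 10000; ++i) ones += BernoulliSample(0.3, &rng);
  // ones / 10000.0 should land close to 0.3
  return 0;
}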
...@@ -56,7 +56,7 @@ endif() ...@@ -56,7 +56,7 @@ endif()
cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op) DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op scale_op)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
......
...@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h, ...@@ -132,6 +132,15 @@ void ProcGetResponse(const VarHandle& var_h,
&trainer_id); &trainer_id);
} }
void ProcGetRecvResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
VLOG(4) << "ProcGetRecvResponse";
framework::Variable* outvar = nullptr;
int trainer_id;
DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
&trainer_id);
}
template <typename T> template <typename T>
void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
::grpc::Slice slice(proto.ByteSizeLong()); ::grpc::Slice slice(proto.ByteSizeLong());
...@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify( ...@@ -482,6 +491,79 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
return h; return h;
} }
VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string send_var_name_val = send_var_name;
const std::string recv_var_name_val = recv_var_name;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
const std::string method = kSendAndRecvRPC;
VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: "
<< send_var_name_val << " Recv_var_name: " << recv_var_name_val;
int retry_times_ = 0;
while (true) {
SendAndRecvProcessor* s = new SendAndRecvProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope));
VarHandlePtr h_recv(
new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
s->RecvPrepare(h_recv);
framework::AsyncIO([send_var_name_val, recv_var_name_val, table_name_val,
p_scope, p_ctx, s, method, h, this] {
auto* send_var = p_scope->FindVar(send_var_name_val);
send_var->GetMutable<framework::LoDTensor>()->set_lod({});
::grpc::ByteBuffer buf;
VLOG(4) << "SerializeToByteBuffer: send_var_name_val: "
<< send_var_name_val
<< " recv_var_name_val: " << recv_var_name_val;
SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf,
recv_var_name_val, trainer_id_, table_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
s->response_call_back_ = ProcGetRecvResponse;
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable",
buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
h->Wait();
if (h->should_retry) {
VLOG(3) << "rpc call failed, retry times " << retry_times_;
retry_times_++;
std::random_device rd;
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
continue;
}
}
return h;
}
}
bool GRPCClient::Wait() { bool GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_); std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
......
...@@ -53,6 +53,8 @@ namespace distributed { ...@@ -53,6 +53,8 @@ namespace distributed {
void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor { class BaseProcessor {
public: public:
BaseProcessor() { context_ = nullptr; } BaseProcessor() { context_ = nullptr; }
...@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor { ...@@ -131,6 +133,28 @@ class GetProcessor : public BaseProcessor {
RequestGetCallBack response_call_back_ = ProcGetResponse; RequestGetCallBack response_call_back_ = ProcGetResponse;
}; };
class SendAndRecvProcessor : public BaseProcessor {
public:
explicit SendAndRecvProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(), stub_g_(ch) {}
virtual ~SendAndRecvProcessor() {}
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(*var_h_recv_.get(), reply_);
var_h_recv_->Finish(true);
}
}
void RecvPrepare(VarHandlePtr h_recv) { var_h_recv_ = h_recv; }
::grpc::ByteBuffer reply_;
::grpc::GenericStub stub_g_;
RequestGetCallBack response_call_back_ = ProcGetResponse;
VarHandlePtr var_h_recv_;
};
class BatchBarrierProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor {
public: public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
...@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient { ...@@ -231,6 +255,14 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendAndRecv(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& send_var_name,
const std::string& recv_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete( VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
......
...@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -76,7 +76,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
PADDLE_THROW("Serialize does not support type: %s", PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name()); typeid(var->Type()).name());
} }
std::string header; std::string header;
request.AppendToString(&header); request.AppendToString(&header);
auto buffer = std::unique_ptr<char[]>(new char[1024]); auto buffer = std::unique_ptr<char[]>(new char[1024]);
...@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -101,7 +100,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} }
#endif #endif
PADDLE_ENFORCE_NOT_NULL(payload); PADDLE_ENFORCE_NOT_NULL(payload);
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
payload->memory_size()); payload->memory_size());
if (payload->memory_size() >= std::numeric_limits<int>::max()) { if (payload->memory_size() >= std::numeric_limits<int>::max()) {
...@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -140,7 +138,6 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::Slice::STEAL_REF); ::grpc::Slice::STEAL_REF);
num_slices = 4; num_slices = 4;
} }
::grpc::ByteBuffer tmp(&slices[0], num_slices); ::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp); msg->Swap(&tmp);
} }
...@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -156,6 +153,19 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
*trainer_id = resp.GetTrainerId(); *trainer_id = resp.GetTrainerId();
} }
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE_EQ(
resp.Parse(msg), 0,
platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
*var = resp.GetRecvVar();
*trainer_id = resp.GetTrainerId();
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -47,6 +47,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var, int* trainer_id); framework::Variable** var, int* trainer_id);
void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id);
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port); ...@@ -28,6 +28,7 @@ DECLARE_int32(rpc_retry_bind_port);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
enum CallStatus { PROCESS = 0, FINISH }; enum CallStatus { PROCESS = 0, FINISH };
// reference: // reference:
...@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase { ...@@ -433,6 +434,51 @@ class RequestNotify final : public RequestBase {
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_; ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
}; };
class RequestSendAndRecv final : public RequestBase {
public:
explicit RequestSendAndRecv(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new GRPCVariableResponse(
request_handler->scope(), request_handler->dev_ctx(),
request_handler->distributed_mode()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kRequestSendAndRecv);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestSendAndRecv() {}
std::string GetReqName() override { return request_->Varname(); }
void Process() override {
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
std::string table_name = request_->TableName();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name << " trainer: " << trainer_id;
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = nullptr;
request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
out_var_name, table_name);
SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
&reply_);
Finish(reply_, &responder_);
}
protected:
std::shared_ptr<GRPCVariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
void AsyncGRPCServer::WaitServerReady() { void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is waiting server ready"; VLOG(4) << "AsyncGRPCServer is waiting server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_); std::unique_lock<std::mutex> lock(this->mutex_ready_);
...@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, ...@@ -586,6 +632,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestNotify) { } else if (rpc_name == kRequestNotify) {
b = new RequestNotify(service_.get(), cq.get(), handler, req_id); b = new RequestNotify(service_.get(), cq.get(), handler, req_id);
} else if (rpc_name == kRequestSendAndRecv) {
b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id);
} else { } else {
PADDLE_ENFORCE(false, "not supported rpc"); PADDLE_ENFORCE(false, "not supported rpc");
} }
......
...@@ -85,10 +85,12 @@ enum class GrpcMethod { ...@@ -85,10 +85,12 @@ enum class GrpcMethod {
kGetMonomerVariable, kGetMonomerVariable,
kGetMonomerBarrier, kGetMonomerBarrier,
kRequestNotify, kRequestNotify,
kRequestSendAndRecv,
// when you add new handler, change kGrpcNumMethods at the same time!
}; };
static const int kGrpcNumMethods = static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kRequestNotify) + 1; static_cast<int>(GrpcMethod::kRequestSendAndRecv) + 1;
inline const char* GrpcMethodName(GrpcMethod id) { inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) { switch (id) {
...@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { ...@@ -108,6 +110,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/CheckpointNotify"; return "/sendrecv.SendRecvService/CheckpointNotify";
case GrpcMethod::kRequestNotify: case GrpcMethod::kRequestNotify:
return "/sendrecv.SendRecvService/DistributeNotify"; return "/sendrecv.SendRecvService/DistributeNotify";
case GrpcMethod::kRequestSendAndRecv:
return "/sendrecv.SendRecvService/SendAndRecvVariable";
} }
// Shouldn't be reached. // Shouldn't be reached.
......
...@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; ...@@ -46,6 +46,7 @@ constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
constexpr char kRequestNotify[] = "RequestNotify"; constexpr char kRequestNotify[] = "RequestNotify";
constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv";
constexpr char kSendRPC[] = "SendRPC"; constexpr char kSendRPC[] = "SendRPC";
constexpr char kGetRPC[] = "GetRPC"; constexpr char kGetRPC[] = "GetRPC";
...@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; ...@@ -57,6 +58,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC";
constexpr int64_t kPrefetchTimeout = 60000; constexpr int64_t kPrefetchTimeout = 60000;
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
......
...@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname, ...@@ -325,6 +325,22 @@ bool RequestNotifyHandler::Handle(const std::string &varname,
return true; return true;
} }
bool RequestSendAndRecvHandler::Handle(const std::string &varname,
framework::Scope *Scope,
framework::Variable *var,
framework::Variable **outvar,
const int trainer_id,
const std::string &out_var_name,
const std::string &table_name) {
VLOG(3) << "SendAndRecvHandle: " << varname
<< " out_var_name: " << out_var_name
<< " , trainer_id: " << trainer_id;
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope);
*outvar = Scope->FindVar(out_var_name);
return true;
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler { ...@@ -176,6 +176,17 @@ class RequestNotifyHandler final : public RequestHandler {
std::unordered_map<int, int64_t> decay_counters; std::unordered_map<int, int64_t> decay_counters;
}; };
class RequestSendAndRecvHandler final : public RequestHandler {
public:
explicit RequestSendAndRecvHandler(int distributed_mode)
: RequestHandler(distributed_mode) {}
virtual ~RequestSendAndRecvHandler() {}
bool Handle(const std::string& varname, framework::Scope* Scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
};
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -85,6 +85,12 @@ class RPCClient { ...@@ -85,6 +85,12 @@ class RPCClient {
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendAndRecv(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& send_var_name,
const std::string& recv_var_name, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendComplete( virtual VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
......
...@@ -35,27 +35,24 @@ namespace platform = paddle::platform; ...@@ -35,27 +35,24 @@ namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed; namespace distributed = paddle::operators::distributed;
USE_NO_KERNEL_OP(lookup_sparse_table_read); USE_NO_KERNEL_OP(lookup_sparse_table_read);
USE_OP(scale);
std::unique_ptr<distributed::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0); auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block); auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::OpDesc* op = block->AppendOp();
framework::VariableNameMap output({{"Output", {"out"}}}); op->SetType("scale");
auto op = block->AppendOp(); op->SetInput("X", {"x"});
op->SetType("lookup_sparse_table_read"); op->SetOutput("Out", {"res"});
op->SetInput("W", {"w"}); op->SetAttr("scale", 0.5f);
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("res");
op->SetAttr("tablename", {"w"});
op->SetAttr("value_names", {"Param"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({10, 10}); out.SetShape({1, 10});
return block; return block;
} }
...@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { ...@@ -69,6 +66,12 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto ids_var = scope->Var("ids"); auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::LoDTensor>(); ids_var->GetMutable<framework::LoDTensor>();
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
} }
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
...@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, ...@@ -78,6 +81,11 @@ void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t* ids_ptr = int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place); ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
} }
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
...@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) { ...@@ -124,6 +132,38 @@ void StartServer(const std::string& rpc_name) {
server_thread.join(); server_thread.join();
} }
void StartSendAndRecvServer(const std::string& rpc_name) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto block = AppendSendAndRecvBlock(&program);
std::string in_var_name("x");
std::vector<int> prefetch_block_ids{block->ID()};
auto prepared = exe.Prepare(program, prefetch_block_ids);
InitTensorsOnServer(&scope, &place, 10);
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx;
grad_to_prepared_ctx[in_var_name] = prepared[0];
g_req_handler->SetProgram(&program);
g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx);
g_req_handler->SetDevCtx(&ctx);
g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(COMPLETE, CPU) { TEST(COMPLETE, CPU) {
setenv("http_proxy", "", 1); setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1); setenv("https_proxy", "", 1);
...@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) { ...@@ -147,3 +187,46 @@ TEST(COMPLETE, CPU) {
g_rpc_service.reset(nullptr); g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr); g_req_handler.reset(nullptr);
} }
TEST(SENDANDRECV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
g_req_handler.reset(new distributed::RequestSendAndRecvHandler(
distributed::DistributedMode::kAsync));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
PADDLE_ENFORCE_NE(client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
std::thread server_thread(StartSendAndRecvServer,
distributed::kRequestSendAndRecv);
g_rpc_service->WaitServerReady();
int port = g_rpc_service->GetSelectedPort();
std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name);
client->Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[i], 0.5);
}
g_rpc_service->ShutDown();
server_thread.join();
LOG(INFO) << "begin reset";
g_rpc_service.reset(nullptr);
g_req_handler.reset(nullptr);
}
...@@ -29,7 +29,7 @@ service SendRecvService { ...@@ -29,7 +29,7 @@ service SendRecvService {
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} rpc DistributeNotify(VariableMessage) returns (VoidMessage) {}
rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
} }
......
...@@ -96,6 +96,13 @@ class VariableResponse { ...@@ -96,6 +96,13 @@ class VariableResponse {
return scope_->FindVar(meta_.varname()); return scope_->FindVar(meta_.varname());
} }
framework::Variable* GetRecvVar() {
if (create_scope_) {
return local_scope_->Var(meta_.out_varname());
}
return scope_->FindVar(meta_.out_varname());
}
int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); } int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
protected: protected:
......
...@@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { ...@@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("Ids"), PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
"Input(Ids) of LookupTableOp should not be null."); platform::errors::InvalidArgument(
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(Ids) of LookupTableOp should not be null."));
"Input(W) of LookupTableOp should not be null."); PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), platform::errors::InvalidArgument(
"Output(Outs) of LookupTableOp should not be null."); "Input(W) of LookupTableOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
platform::errors::InvalidArgument(
"Output(Outs) of LookupTableOp should not be null."));
auto ids_dims = ctx->GetInputsDim("Ids"); auto ids_dims = ctx->GetInputsDim("Ids");
auto table_dims = ctx->GetInputDim("W"); auto table_dims = ctx->GetInputDim("W");
PADDLE_ENFORCE_EQ(table_dims.size(), 2, PADDLE_ENFORCE_EQ(
"Only 2 dimensions of the 'Embedding' is supported."); table_dims.size(), 2,
platform::errors::InvalidArgument(
"Only 2 dimensions of the 'Embedding' is supported."));
for (auto &ids_dim : ids_dims) { for (auto &ids_dim : ids_dims) {
PADDLE_ENFORCE_EQ(ids_dim.size(), 2, PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
"The dimension of the 'Ids' tensor must be 2."); platform::errors::InvalidArgument(
"The dimension of the 'Ids' tensor must be 2."));
} }
auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints"); auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
// for fluid.embedding
auto lookup_table_version = auto lookup_table_version =
ctx->Attrs().Get<std::string>("lookup_table_version"); ctx->Attrs().Get<std::string>("lookup_table_version");
......
...@@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> { ...@@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
auto endpoints = context.Attr<std::vector<std::string>>("endpoints"); auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
auto is_distributed = context.Attr<bool>("is_distributed"); auto is_distributed = context.Attr<bool>("is_distributed");
auto lookup_table_version =
context.Attr<std::string>("lookup_table_version");
operators::distributed::prefetchs(id_names, out_names, embedding_name, operators::distributed::prefetchs(id_names, out_names, embedding_name,
is_distributed, lookup_tables, endpoints, is_distributed, lookup_tables, endpoints,
context, context.scope()); context, context.scope());
if (lookup_table_version == "lookup_table_v2") {
auto &scope = context.scope();
auto emb_dim =
scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
for (size_t i = 0; i < id_names.size(); ++i) {
auto *id_var = scope.FindVar(id_names[i]);
auto *out_var = scope.FindVar(out_names[i]);
auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto id_dims = id_tensor->dims();
out_tensor->Resize(framework::make_ddim(
{static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
static_cast<int64_t>(emb_dim)}));
}
}
} }
}; };
......
...@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -268,7 +268,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2, PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks"); "server program should have at least 2 blocks");
std::vector<int> block_list; std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) { for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid); block_list.push_back(blkid);
...@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -295,6 +294,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_send_and_recv_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
while (true) { while (true) {
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
...@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -394,6 +394,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
new distributed::RequestGetNoBarrierHandler()); new distributed::RequestGetNoBarrierHandler());
request_notify_handler_.reset( request_notify_handler_.reset(
new distributed::RequestNotifyHandler(distributed_mode, fan_in)); new distributed::RequestNotifyHandler(distributed_mode, fan_in));
request_send_and_recv_handler_.reset(
new distributed::RequestSendAndRecvHandler(distributed_mode));
rpc_service_->RegisterRPC(distributed::kRequestSend, rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get(), rpc_send_thread_num); request_send_handler_.get(), rpc_send_thread_num);
...@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -408,6 +410,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_no_barrier_handler_.get()); request_get_no_barrier_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestNotify, rpc_service_->RegisterRPC(distributed::kRequestNotify,
request_notify_handler_.get(), rpc_send_thread_num); request_notify_handler_.get(), rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestSendAndRecv,
request_send_and_recv_handler_.get(),
rpc_get_thread_num);
auto optimize_blocks = auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
...@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -416,6 +421,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
"optimize blocks is less than 1. Optimize blocks " "optimize blocks is less than 1. Optimize blocks "
"should be 1 at least on the pserver side.")); "should be 1 at least on the pserver side."));
auto *program = optimize_blocks[0]->Program(); auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr; std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
...@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -488,6 +494,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
f(request_checkpoint_handler_.get()); f(request_checkpoint_handler_.get());
f(request_get_no_barrier_handler_.get()); f(request_get_no_barrier_handler_.get());
f(request_notify_handler_.get()); f(request_notify_handler_.get());
f(request_send_and_recv_handler_.get());
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
signal(SIGINT, SignalHandler::StopAndExit); signal(SIGINT, SignalHandler::StopAndExit);
......
...@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -99,6 +99,8 @@ class ListenAndServOp : public framework::OperatorBase {
mutable std::shared_ptr<distributed::RequestHandler> mutable std::shared_ptr<distributed::RequestHandler>
request_checkpoint_handler_; request_checkpoint_handler_;
mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_notify_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_send_and_recv_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
mutable std::vector<std::string> sparse_vars_; mutable std::vector<std::string> sparse_vars_;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/parameter_send.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
const auto& place = ctx.GetPlace();
auto send_var_name = ctx.Attr<std::string>("send_var_name");
auto recv_var_name = ctx.Attr<std::string>("recv_var_name");
auto epmap = ctx.Attr<std::string>("endpoint");
auto trainer_id = ctx.Attr<int>("trainer_id");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& context = *pool.Get(place);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
VLOG(3) << "SendAndRecvOp Send_var_name: " << send_var_name
<< " Recv_var_name: " << recv_var_name;
distributed::VarHandlePtr rets = rpc_client->AsyncSendAndRecv(
epmap, context, scope, send_var_name, recv_var_name);
rets->Wait();
}
};
class SendAndRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
AddAttr<std::string>("send_var_name", "Send Tensor's name")
.SetDefault(std::string(""));
AddAttr<std::string>("recv_var_name", "Recv Tensor's name")
.SetDefault(std::string(""));
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::string>("endpoint", "Server endpoint")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
SendAndRecv operator
This operator sends variables to the listen_and_serve op at the parameter server,
and receives the result variable from the parameter server into the send variable's scope.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
REGISTER_OP_CPU_KERNEL(
send_and_recv,
ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
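The new send_and_recv operator carries its endpoint and variable names as attributes rather than extra inputs. A hypothetical wiring sketch, not taken from the source, that appends the op to a fresh program using the same OpDesc calls already used by rpc_server_test earlier in this diff; the variable names "x"/"res" and the endpoint are placeholders matching the OpMaker defaults and the test setup:

#include <string>
#include "paddle/fluid/framework/program_desc.h"

// Append a send_and_recv op to block 0 of the given program (sketch only).
void AppendSendAndRecvOp(paddle::framework::ProgramDesc* program) {
  auto* block = program->MutableBlock(0);
  auto* op = block->AppendOp();
  op->SetType("send_and_recv");
  op->SetInput("X", {"x"});
  op->SetOutput("Out", {"res"});
  op->SetAttr("send_var_name", std::string("x"));
  op->SetAttr("recv_var_name", std::string("res"));
  op->SetAttr("endpoint", std::string("127.0.0.1:6164"));
  op->SetAttr("trainer_id", 0);
}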
...@@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel<T> { ...@@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
auto out_vars = context.MultiOutputVar("Out"); auto out_vars = context.MultiOutputVar("Out");
for (size_t i = 0; i < out_var_names.size(); i++) { for (size_t i = 0; i < out_var_names.size(); i++) {
VLOG(4) << "loading tensor: " << out_var_names[i];
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
out_vars[i], platform::errors::InvalidArgument( out_vars[i], platform::errors::InvalidArgument(
"The variable %s to be loaded cannot be found.", "The variable %s to be loaded cannot be found.",
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/operators/lookup_table_v2_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
namespace paddle { namespace paddle {
...@@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>, ...@@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad, REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradKernel<float>, ops::LookupTableV2GradKernel<float>,
ops::LookupTableV2GradKernel<double>); ops::LookupTableV2GradKernel<double>);
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(lookup_table_v2)
.AddCheckpoint(
R"ROC(fix lookup_table_v2, add input type `int32`)ROC",
paddle::framework::compatible::OpVersionDesc()
.BugfixWithBehaviorChanged("lookup_table_v2 support input type "
"`int64`; after support input type "
"`int32/int64`"));
/* ========================================================================== */
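The checkpoint above records that lookup_table_v2 now also accepts int32 ids; the CPU and CUDA kernels that follow widen such ids to int64 before indexing the table. A minimal host-side sketch of that widening step in plain C++ (WidenIds is a hypothetical helper; the kernels themselves use the std::transform and small CUDA kernel shown below):

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Widen int32 ids to int64, as the CPU kernel does before the table lookup.
std::vector<int64_t> WidenIds(const std::vector<int32_t>& ids32) {
  std::vector<int64_t> ids64;
  ids64.reserve(ids32.size());
  std::transform(ids32.begin(), ids32.end(), std::back_inserter(ids64),
                 [](int32_t id) { return static_cast<int64_t>(id); });
  return ids64;
}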
...@@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids, ...@@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids,
} }
} }
template <typename T>
__global__ void InputTypeCovert(const T *in_ids, const int64_t K,
int64_t *out_ids) {
for (int i = 0; i < K; i++) {
out_ids[i] = (int64_t)(in_ids[i]);
}
}
template <typename T> template <typename T>
class LookupTableV2CUDAKernel : public framework::OpKernel<T> { class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
public: public:
...@@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> { ...@@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel<T> {
size_t D = table_t->dims()[1]; size_t D = table_t->dims()[1];
size_t K = ids_t->numel(); size_t K = ids_t->numel();
auto *ids = ids_t->data<int64_t>();
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
dim3 threads(256, 4); dim3 threads(256, 4);
dim3 grids(80, 1); dim3 grids(80, 1);
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> ids;
ids.resize(K);
const int64_t *ids_p = nullptr;
if (ids_t->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
ids_p = ids.MutableData(context.GetPlace());
} else {
ids_p = ids_t->data<int64_t>();
}
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
if (padding_idx == -1) if (padding_idx == -1)
LookupTableV2< LookupTableV2<
T, 256, 4, 80, T, 256, 4, 80,
false><<<grids, threads, 0, context.cuda_device_context().stream()>>>( false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx); output, table, ids_p, N, K, D, padding_idx);
else else
LookupTableV2< LookupTableV2<
T, 256, 4, 80, T, 256, 4, 80,
true><<<grids, threads, 0, context.cuda_device_context().stream()>>>( true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx); output, table, ids_p, N, K, D, padding_idx);
} }
}; };
...@@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> { ...@@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel(); int64_t ids_num = ids->numel();
dim3 threads(128, 8);
dim3 grids(8, 1);
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
// copy GPU memory to CPU pinned memory // copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows; framework::Vector<int64_t> new_rows;
new_rows.resize(ids_num); new_rows.resize(ids_num);
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
// TODO(yuyang18): Strange code here. if (ids->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids->data<int>(), ids_num,
new_rows.MutableData(context.GetPlace()));
} else {
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream); gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
}
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
...@@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> { ...@@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
int N = d_table_t->dims()[0]; int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1]; int D = d_table_t->dims()[1];
int K = ids_t->numel(); int K = ids_t->numel();
const int64_t *ids = ids_t->data<int64_t>();
dim3 threads(128, 8);
dim3 grids(8, 1);
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> ids;
ids.resize(K);
const int64_t *ids_p = nullptr;
if (ids_t->type() == framework::proto::VarType::INT32) {
InputTypeCovert<
int><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
ids_t->data<int>(), K, ids.MutableData(context.GetPlace()));
ids_p = ids.MutableData(context.GetPlace());
} else {
ids_p = ids_t->data<int64_t>();
}
const T *d_output = d_output_t->data<T>(); const T *d_output = d_output_t->data<T>();
T *d_table = d_table_t->mutable_data<T>(context.GetPlace()); T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t); auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0)); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
dim3 threads(128, 8);
dim3 grids(8, 1);
LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>( LookupTableV2Grad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids, N, K, D); d_table, d_output, ids_p, N, K, D);
} }
} }
}; };
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -45,32 +46,19 @@ class LookupTableV2Kernel : public framework::OpKernel<T> { ...@@ -45,32 +46,19 @@ class LookupTableV2Kernel : public framework::OpKernel<T> {
auto *output_t = context.Output<LoDTensor>("Out"); // float tensor auto *output_t = context.Output<LoDTensor>("Out"); // float tensor
auto *table_var = context.InputVar("W"); auto *table_var = context.InputVar("W");
auto id_name = context.InputNames("Ids").front(); int64_t padding_idx = context.Attr<int64_t>("padding_idx");
auto embedding_name = context.InputNames("W").front(); int64_t ids_numel = ids_t->numel();
auto out_name = context.OutputNames("Out").front();
// for remote prefetch
auto epmap = context.Attr<std::vector<std::string>>("epmap");
auto remote_prefetch = context.Attr<bool>("remote_prefetch");
auto table_names = context.Attr<std::vector<std::string>>("table_names");
if (remote_prefetch && !epmap.empty()) { std::vector<int64_t> ids;
// if epmap is not empty, then the parameter will be fetched from remote ids.reserve(ids_numel);
// parameter server
#ifdef PADDLE_WITH_DISTRIBUTE if (ids_t->type() == framework::proto::VarType::INT32) {
operators::distributed::prefetch(id_name, out_name, embedding_name, false, std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_numel,
table_names, epmap, context, std::back_inserter(ids),
context.scope()); [&](int id) { return static_cast<int64_t>(id); });
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
} else { } else {
int64_t padding_idx = context.Attr<int64_t>("padding_idx"); framework::TensorToVector(*ids_t, &ids);
int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>()); }
int64_t ids_numel = ids_t->numel();
if (table_var->IsType<LoDTensor>()) { if (table_var->IsType<LoDTensor>()) {
auto *table_t = context.Input<LoDTensor>("W"); auto *table_t = context.Input<LoDTensor>("W");
...@@ -117,8 +105,8 @@ class LookupTableV2Kernel : public framework::OpKernel<T> { ...@@ -117,8 +105,8 @@ class LookupTableV2Kernel : public framework::OpKernel<T> {
"expected >= 0. But received %ld", "expected >= 0. But received %ld",
ids[i]); ids[i]);
auto id_index = table_t.Index(ids[i]); auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(id_index, 0,
id_index, 0, "the input key should be exists. But received %d.", "the input key should be exists. But received %d.",
id_index); id_index);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
...@@ -126,7 +114,6 @@ class LookupTableV2Kernel : public framework::OpKernel<T> { ...@@ -126,7 +114,6 @@ class LookupTableV2Kernel : public framework::OpKernel<T> {
} }
} }
} }
}
}; };
template <typename T> template <typename T>
...@@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> { ...@@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
// Since paddings are not trainable and fixed in forward, the gradient of // Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward. // paddings makes no sense and we don't deal with it in backward.
if (is_sparse) { if (is_sparse) {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids_t = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W")); auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
int64_t ids_num = ids_t->numel();
std::vector<int64_t> ids;
ids.reserve(ids_num);
auto *ids_data = ids->data<int64_t>(); if (ids_t->type() == framework::proto::VarType::INT32) {
int64_t ids_num = ids->numel(); std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
std::back_inserter(ids),
[&](int id) { return static_cast<int64_t>(id); });
} else {
framework::TensorToVector(*ids_t, &ids);
}
std::vector<int64_t> new_rows; d_table->set_rows(ids);
new_rows.resize(ids_num);
std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->Resize({ids_num, table_dim[1]});
...@@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> { ...@@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else { } else {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids_t = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W")); auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
int64_t ids_num = ids_t->numel();
std::vector<int64_t> ids;
ids.reserve(ids_num);
if (ids_t->type() == framework::proto::VarType::INT32) {
std::transform(ids_t->data<int>(), ids_t->data<int>() + ids_num,
std::back_inserter(ids),
[&](int id) { return static_cast<int64_t>(id); });
} else {
framework::TensorToVector(*ids_t, &ids);
}
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids.data();
int64_t N = table_dim[0]; int64_t N = table_dim[0];
int64_t D = table_dim[1]; int64_t D = table_dim[1];
...@@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> { ...@@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel<T> {
memset(d_table_data, 0, d_table->numel() * sizeof(T)); memset(d_table_data, 0, d_table->numel() * sizeof(T));
for (int64_t i = 0; i < ids->numel(); ++i) { for (int64_t i = 0; i < ids_num; ++i) {
if (padding_idx != kNoPadding && ids_data[i] == padding_idx) { if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
// the gradient of padding_idx should be 0, already done by memset, so // the gradient of padding_idx should be 0, already done by memset, so
// do nothing. // do nothing.
......
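For reference, a minimal NumPy sketch (not Paddle code) of what the CPU kernel above now does with the ids: int32 ids are widened to int64 before the table lookup, and in the dense backward pass the rows selected by padding_idx keep a zero gradient. Shapes and values below are made up for illustration.

import numpy as np

table = np.arange(12, dtype=np.float32).reshape(4, 3)   # 4 rows, width 3
ids_i32 = np.array([1, 3, 0], dtype=np.int32)

# Accept int32 or int64 ids by widening to int64 before indexing, as the kernel does.
ids = ids_i32.astype(np.int64)
out = table[ids]                                         # forward lookup

# Dense backward: scatter-add d_out rows into d_table, leaving the padding_idx row at zero.
padding_idx = 0
d_out = np.ones_like(out)
d_table = np.zeros_like(table)
for i, row in enumerate(ids):
    if row == padding_idx:
        continue
    d_table[row] += d_out[i]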
...@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
static inline int SizeOutAxis(const int axis, DDim dims) {
int size = 1;
for (int i = axis + 1; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
template <typename T> template <typename T>
class SoftmaxCUDNNKernel : public framework::OpKernel<T> { class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = context.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out"); auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
// allocate memory on device. auto* out_data = out->data<T>();
Out->mutable_data<T>(context.GetPlace());
auto dims = x->dims();
auto dims = X->dims(); const int rank = dims.size();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
framework::LoDTensor flattened_x; const int dim = dims[axis];
framework::LoDTensor flattened_out; const int N = SizeToAxis(axis, dims);
flattened_x.ShareDataWith(*X).Resize(flattened_dims); const int D = SizeOutAxis(axis, dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
ScopedTensorDescriptor desc;
math::SoftmaxCUDNNFunctor<T>()( std::vector<int> tensor_dims = {N, dim, D, 1};
context.template device_context<platform::CUDADeviceContext>(), DataLayout layout = DataLayout::kNCHW;
&flattened_x, &flattened_out); cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
platform::CudnnDataType<T>::kZero(), desc_, out_data));
} }
}; };
template <typename T> template <typename T>
class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> { class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* Out = context.Input<Tensor>("Out"); auto* out = ctx.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
// allocate memory on device. auto* dx_data = dx->data<T>();
dX->mutable_data<T>(context.GetPlace());
auto dims = out->dims();
auto dims = Out->dims(); const int rank = dims.size();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
framework::LoDTensor flattened_out; const int dim = dims[axis];
framework::LoDTensor flattened_d_out; const int N = SizeToAxis(axis, dims);
framework::LoDTensor flattened_d_x; const int D = SizeOutAxis(axis, dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); ScopedTensorDescriptor desc;
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); std::vector<int> tensor_dims = {N, dim, D, 1};
DataLayout layout = DataLayout::kNCHW;
math::SoftmaxGradCUDNNFunctor<T>()( cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
context.template device_context<platform::CUDADeviceContext>(),
&flattened_out, &flattened_d_out, &flattened_d_x); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
: CUDNN_SOFTMAX_MODE_CHANNEL;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
handle, CUDNN_SOFTMAX_ACCURATE, mode,
platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_, dx_data));
} }
}; };
......
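As a sanity check on the index arithmetic above (SizeToAxis, SizeOutAxis and the {N, dim, D, 1} descriptor): softmax over an arbitrary axis is the same as viewing the tensor as (N, dim, D), with N the product of the dimensions before the axis and D the product of those after it, and normalizing over the middle dimension, which is what CUDNN_SOFTMAX_MODE_CHANNEL computes (MODE_INSTANCE covers the last-axis case). A small NumPy sketch of that equivalence:

import numpy as np

def softmax_along(x, axis):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def softmax_via_n_dim_d_view(x, axis):
    dims = x.shape
    N = int(np.prod(dims[:axis], dtype=np.int64))        # SizeToAxis
    D = int(np.prod(dims[axis + 1:], dtype=np.int64))    # SizeOutAxis
    v = x.reshape(N, dims[axis], D)
    return softmax_along(v, axis=1).reshape(dims)        # normalize over the "channel" dim

x = np.random.rand(2, 3, 4, 5).astype(np.float32)
for axis in range(x.ndim):
    assert np.allclose(softmax_along(x, axis), softmax_via_n_dim_d_view(x, axis), atol=1e-6)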
...@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel {
"Attr(axis) value should be in range [-R, R-1], " "Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(X).")); "R is the rank of Input(X)."));
auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
if (axis != rank_x - 1 && axis != -1) {
PADDLE_ENFORCE_EQ(use_cudnn, false,
platform::errors::InvalidArgument(
"CUDNN kernel only support axis as -1."));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
......
...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unsqueeze_op.h" #include "paddle/fluid/operators/unsqueeze_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, ...@@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>, unsqueeze, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL( ...@@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL(
unsqueeze_grad, unsqueeze_grad,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>, ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>, ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>, ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>, ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>, unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL(
unsqueeze2_grad, unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>, ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>, ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>, ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>, ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>, unsqueeze, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>, ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>, plat::float16>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>, ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>, ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>, ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>, ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::float16>, plat::float16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>, ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>, ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -57,7 +57,11 @@ void BindFleetWrapper(py::module* m) { ...@@ -57,7 +57,11 @@ void BindFleetWrapper(py::module* m) {
.def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold) .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold)
.def("cache_shuffle", &framework::FleetWrapper::CacheShuffle) .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle)
.def("save_cache", &framework::FleetWrapper::SaveCache) .def("save_cache", &framework::FleetWrapper::SaveCache)
.def("save_model_with_whitelist",
&framework::FleetWrapper::SaveWithWhitelist)
.def("load_model", &framework::FleetWrapper::LoadModel) .def("load_model", &framework::FleetWrapper::LoadModel)
.def("load_table_with_whitelist",
&framework::FleetWrapper::LoadWithWhitelist)
.def("clear_model", &framework::FleetWrapper::ClearModel) .def("clear_model", &framework::FleetWrapper::ClearModel)
.def("clear_one_table", &framework::FleetWrapper::ClearOneTable) .def("clear_one_table", &framework::FleetWrapper::ClearOneTable)
.def("stop_server", &framework::FleetWrapper::StopServer) .def("stop_server", &framework::FleetWrapper::StopServer)
......
...@@ -334,8 +334,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { ...@@ -334,8 +334,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
} while (0) } while (0)
static void RegisterGlobalVarGetterSetter() { static void RegisterGlobalVarGetterSetter() {
REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_use_mkldnn, REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
FLAGS_free_idle_chunk,
FLAGS_free_when_no_cache_hit); FLAGS_free_when_no_cache_hit);
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
...@@ -349,7 +348,7 @@ static void RegisterGlobalVarGetterSetter() { ...@@ -349,7 +348,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads); FLAGS_paddle_num_threads, FLAGS_use_mkldnn);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
......
...@@ -448,7 +448,6 @@ void BindAnalysisConfig(py::module *m) { ...@@ -448,7 +448,6 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::cpu_math_library_num_threads) &AnalysisConfig::cpu_math_library_num_threads)
.def("to_native_config", &AnalysisConfig::ToNativeConfig) .def("to_native_config", &AnalysisConfig::ToNativeConfig)
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
py::return_value_policy::reference) py::return_value_policy::reference)
...@@ -566,7 +565,6 @@ void BindPaddlePassBuilder(py::module *m) { ...@@ -566,7 +565,6 @@ void BindPaddlePassBuilder(py::module *m) {
.def("enable_cudnn", &PassStrategy::EnableCUDNN) .def("enable_cudnn", &PassStrategy::EnableCUDNN)
.def("enable_mkldnn", &PassStrategy::EnableMKLDNN) .def("enable_mkldnn", &PassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer) .def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16)
.def("use_gpu", &PassStrategy::use_gpu); .def("use_gpu", &PassStrategy::use_gpu);
py::class_<CpuPassStrategy, PassStrategy>(*m, "CpuPassStrategy") py::class_<CpuPassStrategy, PassStrategy>(*m, "CpuPassStrategy")
...@@ -574,16 +572,14 @@ void BindPaddlePassBuilder(py::module *m) { ...@@ -574,16 +572,14 @@ void BindPaddlePassBuilder(py::module *m) {
.def(py::init<const CpuPassStrategy &>()) .def(py::init<const CpuPassStrategy &>())
.def("enable_cudnn", &CpuPassStrategy::EnableCUDNN) .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN)
.def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN) .def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer) .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer);
.def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16);
py::class_<GpuPassStrategy, PassStrategy>(*m, "GpuPassStrategy") py::class_<GpuPassStrategy, PassStrategy>(*m, "GpuPassStrategy")
.def(py::init<>()) .def(py::init<>())
.def(py::init<const GpuPassStrategy &>()) .def(py::init<const GpuPassStrategy &>())
.def("enable_cudnn", &GpuPassStrategy::EnableCUDNN) .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN)
.def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN) .def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN)
.def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer) .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer);
.def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16);
} }
} // namespace } // namespace
} // namespace pybind } // namespace pybind
......
...@@ -42,7 +42,6 @@ requirements: ...@@ -42,7 +42,6 @@ requirements:
- nltk - nltk
- scipy - scipy
- requests - requests
- pyyaml
- pillow - pillow
- graphviz - graphviz
- protobuf - protobuf
...@@ -62,7 +61,6 @@ requirements: ...@@ -62,7 +61,6 @@ requirements:
- nltk - nltk
- scipy - scipy
- requests - requests
- pyyaml
- pillow - pillow
- graphviz - graphviz
- protobuf - protobuf
...@@ -89,13 +87,11 @@ about: ...@@ -89,13 +87,11 @@ about:
pip install /package/objgraph-3.4.1.tar.gz pip install /package/objgraph-3.4.1.tar.gz
pip install /package/prettytable-0.7.tar.gz pip install /package/prettytable-0.7.tar.gz
pip install /package/rarfile-3.0.tar.gz --no-deps pip install /package/rarfile-3.0.tar.gz --no-deps
pip install /package/funcsigs-1.0.2.tar.gz
""" """
self.blt_const = r""" self.blt_const = r"""
pip install C:\package\objgraph-3.4.1.tar.gz pip install C:\package\objgraph-3.4.1.tar.gz
pip install C:\package\prettytable-0.7.tar.gz pip install C:\package\prettytable-0.7.tar.gz
pip install C:\package\funcsigs-1.0.2.tar.gz
pip install C:\package\rarfile-3.0.tar.gz --no-deps pip install C:\package\rarfile-3.0.tar.gz --no-deps
git clone https://github.com/PaddlePaddle/recordio.git git clone https://github.com/PaddlePaddle/recordio.git
cd recordio\python cd recordio\python
......
...@@ -19,10 +19,14 @@ rem ================================================= ...@@ -19,10 +19,14 @@ rem =================================================
rem Paddle CI Task On Windows Platform rem Paddle CI Task On Windows Platform
rem ================================================= rem =================================================
rem -------clean up environment-----------
set work_dir=%cd% set work_dir=%cd%
if exist build rmdir build /s/q if exist build rmdir build /s/q
mkdir build mkdir build
cd /d build cd /d build
tree .
dir paddle\fluid\pybind\Release
taskkill /f /im op_function_generator.exe 2>NUL
rem ------initialize the virtual environment------ rem ------initialize the virtual environment------
if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
...@@ -59,13 +63,12 @@ if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF ...@@ -59,13 +63,12 @@ if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF
if not defined WITH_TPCACHE set WITH_TPCACHE=ON if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem ------set cache third_party------ rem ------set cache third_party------
set cache_dir=%work_dir%\..\cache set cache_dir=%work_dir:Paddle=cache%
dir %cache_dir% dir %cache_dir%
set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
if not exist %cache_dir%\tools ( if not exist %cache_dir%\tools (
git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL%
) )
if "%WITH_TPCACHE%"=="OFF" ( if "%WITH_TPCACHE%"=="OFF" (
...@@ -125,6 +128,8 @@ echo ======================================== ...@@ -125,6 +128,8 @@ echo ========================================
echo Step 1. Cmake ... echo Step 1. Cmake ...
echo ======================================== echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
set start=%start:~4,10%
echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
...@@ -150,7 +155,7 @@ call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd6 ...@@ -150,7 +155,7 @@ call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd6
set build_times=1 set build_times=1
:build_tp :build_tp
echo Build third_party for %build_times% time: echo Build third_party the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
...@@ -165,7 +170,7 @@ echo Build third_party successfully! ...@@ -165,7 +170,7 @@ echo Build third_party successfully!
set build_times=1 set build_times=1
:build_paddle :build_paddle
echo Build Paddle for %build_times% time: echo Build Paddle the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
...@@ -176,7 +181,9 @@ if %ERRORLEVEL% NEQ 0 ( ...@@ -176,7 +181,9 @@ if %ERRORLEVEL% NEQ 0 (
goto :build_paddle goto :build_paddle
) )
) )
echo Build Paddle successfully! echo Build Paddle successfully!
goto:eof goto:eof
:build_error :build_error
...@@ -189,6 +196,17 @@ rem ---------------------------------------------------------------------------- ...@@ -189,6 +196,17 @@ rem ----------------------------------------------------------------------------
echo ======================================== echo ========================================
echo Step 3. Test pip install whl package ... echo Step 3. Test pip install whl package ...
echo ======================================== echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10%
call :timestamp "%start%" "%end%" "Build"
tree /F %cd%\fluid_inference_install_dir\paddle
%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\fluid_inference_install_dir\paddle\lib > lib_size.txt
set /p libsize=< lib_size.txt
for /F %%i in ("%libsize%") do echo "Windows Fluid_Inference Size: %%i"
%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt
set /p whlsize=< whl_size.txt
for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i"
dir /s /b python\dist\*.whl > whl_file.txt dir /s /b python\dist\*.whl > whl_file.txt
set /p PADDLE_WHL_FILE_WIN=< whl_file.txt set /p PADDLE_WHL_FILE_WIN=< whl_file.txt
...@@ -215,6 +233,8 @@ echo ======================================== ...@@ -215,6 +233,8 @@ echo ========================================
echo Step 4. Running unit tests ... echo Step 4. Running unit tests ...
echo ======================================== echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
set start=%start:~4,10%
dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
...@@ -237,8 +257,11 @@ echo ======================================== ...@@ -237,8 +257,11 @@ echo ========================================
echo Step 5. Testing fluid library for inference ... echo Step 5. Testing fluid library for inference ...
echo ======================================== echo ========================================
cd %work_dir%\paddle\fluid\inference\api\demo_ci for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10%
call :timestamp "%start%" "%end%" "TestCases Total"
cd %work_dir%\paddle\fluid\inference\api\demo_ci
%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
goto:eof goto:eof
...@@ -253,7 +276,6 @@ echo ======================================== ...@@ -253,7 +276,6 @@ echo ========================================
echo Step 6. Check whether deleting a unit test ... echo Step 6. Check whether deleting a unit test ...
echo ======================================== echo ========================================
set PATH=%PYTHON_ROOT%;%PATH%
cd /d %work_dir%\build cd /d %work_dir%\build
echo set -ex> check_change_of_unittest.sh echo set -ex> check_change_of_unittest.sh
echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh
...@@ -325,6 +347,43 @@ call paddle_winci\Scripts\deactivate.bat 2>NUL ...@@ -325,6 +347,43 @@ call paddle_winci\Scripts\deactivate.bat 2>NUL
exit /b 1 exit /b 1
:timestamp
echo on
setlocal enabledelayedexpansion
set start=%~1
set dd=%start:~2,2%
set /a dd=100%dd%%%100
set hh=%start:~4,2%
set /a hh=100%hh%%%100
set nn=%start:~6,2%
set /a nn=100%nn%%%100
set ss=%start:~8,2%
set /a ss=100%ss%%%100
set /a start_sec=dd*86400+hh*3600+nn*60+ss
echo %start_sec%
set end=%~2
set dd=%end:~2,2%
set /a dd=100%dd%%%100
if %start:~0,2% NEQ %end:~0,2% (
set month_day=0
for %%i in (01 03 05 07 08 10 12) DO if %%i EQU %start:~0,2% set month_day=31
for %%i in (04 06 09 11) DO if %%i EQU %start:~0,2% set month_day=30
for %%i in (02) DO if %%i EQU %start:~0,2% set month_day=28
set /a dd=%dd%+!month_day!
)
set hh=%end:~4,2%
set /a hh=100%hh%%%100
set nn=%end:~6,2%
set /a nn=100%nn%%%100
set ss=%end:~8,2%
set /a ss=100%ss%%%100
set /a end_secs=dd*86400+hh*3600+nn*60+ss
set /a cost_secs=end_secs-start_sec
echo "Windows %~3 Time: %cost_secs%s"
goto:eof
rem --------------------------------------------------------------------------------------------- rem ---------------------------------------------------------------------------------------------
:success :success
echo ======================================== echo ========================================
...@@ -340,7 +399,7 @@ taskkill /f /im git-remote-https.exe 2>NUL ...@@ -340,7 +399,7 @@ taskkill /f /im git-remote-https.exe 2>NUL
taskkill /f /im vctip.exe 2>NUL taskkill /f /im vctip.exe 2>NUL
taskkill /f /im cvtres.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL
taskkill /f /im rc.exe 2>NUL taskkill /f /im rc.exe 2>NUL
taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe 2>NUL taskkill /f /im op_function_generator.exe 2>NUL
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
call paddle_winci\Scripts\deactivate.bat 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
......
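For readers following the new :timestamp routine above: the repeated `set /a xx=100%xx%%%100` lines strip leading zeros so that values such as `08` are not parsed as octal by batch arithmetic, and the month_day block accounts for a run that crosses a month boundary. A rough Python equivalent, assuming the same `MMDDhhmmss` slice of wmic's localdatetime, is:

def elapsed_seconds(start, end):
    """start/end are the 10-char 'MMDDhhmmss' slices taken from wmic localdatetime."""
    days_in_month = {'01': 31, '02': 28, '03': 31, '04': 30, '05': 31, '06': 30,
                     '07': 31, '08': 31, '09': 30, '10': 31, '11': 30, '12': 31}

    def to_secs(stamp):
        dd, hh, nn, ss = (int(stamp[i:i + 2]) for i in (2, 4, 6, 8))
        return dd * 86400 + hh * 3600 + nn * 60 + ss

    end_secs = to_secs(end)
    if start[:2] != end[:2]:                 # the run crossed a month boundary
        end_secs += days_in_month[start[:2]] * 86400
    return end_secs - to_secs(start)

print(elapsed_seconds('0831235900', '0901000130'))   # 150 seconds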
...@@ -959,7 +959,7 @@ set +x ...@@ -959,7 +959,7 @@ set +x
retry_unittests_record="$retry_unittests_record$failed_test_lists" retry_unittests_record="$retry_unittests_record$failed_test_lists"
failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' ) read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
echo "=========================================" echo "========================================="
echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "This is the ${exec_time_array[$exec_times]} time to re-run"
echo "=========================================" echo "========================================="
...@@ -1395,6 +1395,26 @@ function example() { ...@@ -1395,6 +1395,26 @@ function example() {
fi fi
} }
function summary_check_problems() {
set +x
local check_style_code=$1
local example_code=$2
if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then
echo "========================================"
echo "summary problems:"
echo "========================================"
if [ $check_style_code -ne 0 ];then
echo "- Check code style failed! Please check the log and fix problems."
fi
if [ $example_code -ne 0 ];then
echo "- Check example code failed! Please check the log and fix problems."
fi
[ $check_style_code -ne 0 ] && exit $check_style_code
[ $example_code -ne 0 ] && exit $example_code
fi
set -x
}
function main() { function main() {
local CMD=$1 local CMD=$1
local parallel_number=$2 local parallel_number=$2
...@@ -1407,12 +1427,15 @@ function main() { ...@@ -1407,12 +1427,15 @@ function main() {
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
;; ;;
build_and_check) build_and_check)
check_style $(check_style >&2)
check_style_code=$?
generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
check_sequence_op_unittest check_sequence_op_unittest
generate_api_spec ${PYTHON_ABI:-""} "PR" generate_api_spec ${PYTHON_ABI:-""} "PR"
example $(example >&2)
example_code=$?
summary_check_problems $check_style_code $example_code
assert_api_spec_approvals assert_api_spec_approvals
;; ;;
build) build)
......
...@@ -95,7 +95,6 @@ if (WITH_TESTING) ...@@ -95,7 +95,6 @@ if (WITH_TESTING)
add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests) add_subdirectory(paddle/fluid/contrib/tests)
add_subdirectory(paddle/fluid/contrib/slim/tests) add_subdirectory(paddle/fluid/contrib/slim/tests)
add_subdirectory(paddle/incubate/hapi/tests)
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -256,12 +256,16 @@ from .device import get_device ...@@ -256,12 +256,16 @@ from .device import get_device
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS # from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS
from . import incubate
from .incubate import hapi
from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS
from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS
from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS
from .fluid.dygraph.base import no_grad #DEFINE_ALIAS from .fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS
from . import jit from . import jit
from . import static from . import static
# high-level api
from .hapi import Model
from .hapi import callbacks
import paddle.text
import paddle.vision
...@@ -196,3 +196,14 @@ def cluster_files_reader(files_pattern, ...@@ -196,3 +196,14 @@ def cluster_files_reader(files_pattern,
yield line yield line
return reader return reader
def _check_exists_and_download(path, url, md5, module_name, download=True):
if path and os.path.exists(path):
return path
if download:
return paddle.dataset.common.download(url, module_name, md5)
else:
raise ValueError('{} does not exist and auto download is disabled'.format(
path))
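A hypothetical caller sketch for the helper added above; the dataset name, URL and MD5 below are placeholders, not real metadata. A reader passes an optional local path: if it exists it is used directly, otherwise the file is downloaded unless download is disabled.

_DEMO_URL = 'https://example.com/demo_vocab.tar.gz'          # placeholder
_DEMO_MD5 = '0123456789abcdef0123456789abcdef'               # placeholder

def demo_vocab_path(path=None, download=True):
    # Prefer an existing local file, else download it (or raise if download=False).
    return _check_exists_and_download(path, _DEMO_URL, _DEMO_MD5,
                                      'demo_dataset', download)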
...@@ -36,7 +36,7 @@ import tarfile ...@@ -36,7 +36,7 @@ import tarfile
import gzip import gzip
from collections import defaultdict from collections import defaultdict
import paddle.dataset.common import paddle
import paddle.compat as cpt import paddle.compat as cpt
__all__ = [ __all__ = [
......
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
# limitations under the License. # limitations under the License.
# TODO: define the functions to manipulate devices # TODO: define the functions to manipulate devices
import re
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid import framework from paddle.fluid import framework
import re from paddle.fluid.dygraph.parallel import ParallelEnv
__all__ = [ __all__ = [
'get_cudnn_version', 'get_cudnn_version',
...@@ -81,8 +83,8 @@ def set_device(device): ...@@ -81,8 +83,8 @@ def set_device(device):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_imperative() paddle.disable_static()
paddle.fluid.dygraph.set_device("gpu:0") paddle.set_device("cpu")
x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
data = paddle.stack([x1,x2], axis=1) data = paddle.stack([x1,x2], axis=1)
...@@ -90,18 +92,28 @@ def set_device(device): ...@@ -90,18 +92,28 @@ def set_device(device):
lower_device = device.lower() lower_device = device.lower()
if lower_device == 'cpu': if lower_device == 'cpu':
place = core.CPUPlace() place = core.CPUPlace()
framework._set_expected_place(place) elif lower_device == 'gpu':
if not core.is_compiled_with_cuda():
raise ValueError(
"The device should not be 'gpu', " \
"since PaddlePaddle is not compiled with CUDA")
place = core.CUDAPlace(ParallelEnv().dev_id)
else: else:
avaliable_device = ((lower_device == 'cpu') or avaliable_device = re.match(r'gpu:\d+', lower_device)
re.match(r'gpu:\d+', lower_device))
if not avaliable_device: if not avaliable_device:
raise ValueError( raise ValueError(
"The device must be a string which is like 'cpu' or 'gpu:0'") "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'"
)
if not core.is_compiled_with_cuda():
raise ValueError(
"The device should not be {}, since PaddlePaddle is " \
"not compiled with CUDA".format(avaliable_device))
device_info_list = device.split(':', 1) device_info_list = device.split(':', 1)
device_id = device_info_list[1] device_id = device_info_list[1]
device_id = int(device_id) device_id = int(device_id)
place = core.CUDAPlace(device_id) place = core.CUDAPlace(device_id)
framework._set_expected_place(place) framework._set_expected_place(place)
return place
def get_device(): def get_device():
...@@ -116,8 +128,8 @@ def get_device(): ...@@ -116,8 +128,8 @@ def get_device():
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_imperative() paddle.disable_static()
device = paddle.fluid.dygraph.get_device() device = paddle.get_device()
""" """
device = '' device = ''
......
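A short usage sketch of the device strings accepted by the reworked set_device above; the 'gpu' forms assume a CUDA build (on a CPU-only build they raise ValueError, as coded):

import paddle

paddle.disable_static()
paddle.set_device('cpu')        # CPUPlace
paddle.set_device('gpu')        # CUDAPlace(ParallelEnv().dev_id), CUDA build only
paddle.set_device('gpu:0')      # explicit card id

try:
    paddle.set_device('xpu0')   # does not match 'cpu', 'gpu' or 'gpu:N'
except ValueError as err:
    print(err)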
...@@ -73,13 +73,14 @@ def broadcast(tensor, src, group=0): ...@@ -73,13 +73,14 @@ def broadcast(tensor, src, group=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np
import paddle import paddle
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]]) np_data = np.array([[4, 5, 6], [4, 5, 6]])
else: else:
np_data = np.array([[1, 2, 3], [1, 2, 3]]) np_data = np.array([[1, 2, 3], [1, 2, 3]])
...@@ -129,14 +130,15 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): ...@@ -129,14 +130,15 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np
import paddle import paddle
from paddle.distributed import ReduceOp from paddle.distributed import ReduceOp
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]]) np_data = np.array([[4, 5, 6], [4, 5, 6]])
else: else:
np_data = np.array([[1, 2, 3], [1, 2, 3]]) np_data = np.array([[1, 2, 3], [1, 2, 3]])
...@@ -204,13 +206,14 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): ...@@ -204,13 +206,14 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np
import paddle import paddle
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
np_data = np.array([[4, 5, 6], [4, 5, 6]]) np_data = np.array([[4, 5, 6], [4, 5, 6]])
else: else:
np_data = np.array([[1, 2, 3], [1, 2, 3]]) np_data = np.array([[1, 2, 3], [1, 2, 3]])
...@@ -286,14 +289,15 @@ def all_gather(tensor_list, tensor, group=0): ...@@ -286,14 +289,15 @@ def all_gather(tensor_list, tensor, group=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np
import paddle import paddle
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
tensor_list = [] tensor_list = []
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
np_data1 = np.array([[4, 5, 6], [4, 5, 6]]) np_data1 = np.array([[4, 5, 6], [4, 5, 6]])
np_data2 = np.array([[4, 5, 6], [4, 5, 6]]) np_data2 = np.array([[4, 5, 6], [4, 5, 6]])
data1 = paddle.to_tensor(np_data1) data1 = paddle.to_tensor(np_data1)
...@@ -304,7 +308,7 @@ def all_gather(tensor_list, tensor, group=0): ...@@ -304,7 +308,7 @@ def all_gather(tensor_list, tensor, group=0):
np_data2 = np.array([[1, 2, 3], [1, 2, 3]]) np_data2 = np.array([[1, 2, 3], [1, 2, 3]])
data1 = paddle.to_tensor(np_data1) data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2) data2 = paddle.to_tensor(np_data2)
out = paddle.distributed.all_gather(tensor_list, data2) paddle.distributed.all_gather(tensor_list, data2)
""" """
op_type = 'c_allgather' op_type = 'c_allgather'
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
...@@ -359,13 +363,14 @@ def scatter(tensor, tensor_list=None, src=0, group=0): ...@@ -359,13 +363,14 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np
import paddle import paddle
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
np_data1 = np.array([7, 8, 9]) np_data1 = np.array([7, 8, 9])
np_data2 = np.array([10, 11, 12]) np_data2 = np.array([10, 11, 12])
else: else:
...@@ -373,7 +378,7 @@ def scatter(tensor, tensor_list=None, src=0, group=0): ...@@ -373,7 +378,7 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
np_data2 = np.array([4, 5, 6]) np_data2 = np.array([4, 5, 6])
data1 = paddle.to_tensor(np_data1) data1 = paddle.to_tensor(np_data1)
data2 = paddle.to_tensor(np_data2) data2 = paddle.to_tensor(np_data2)
if paddle.ParallelEnv().local_rank == 0: if paddle.distributed.ParallelEnv().local_rank == 0:
paddle.distributed.scatter(data1, src=1) paddle.distributed.scatter(data1, src=1)
else: else:
paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
...@@ -426,11 +431,11 @@ def barrier(group=0): ...@@ -426,11 +431,11 @@ def barrier(group=0):
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.prepare_context as prepare_context from paddle.distributed import init_parallel_env
paddle.disable_static() paddle.disable_static()
paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
prepare_context() init_parallel_env()
paddle.distributed.barrier() paddle.distributed.barrier()
""" """
op_type = 'barrier' op_type = 'barrier'
......
...@@ -22,8 +22,6 @@ from .runtime_factory import RuntimeFactory ...@@ -22,8 +22,6 @@ from .runtime_factory import RuntimeFactory
from .util_factory import UtilFactory from .util_factory import UtilFactory
from paddle.fluid.wrapped_decorator import wrap_decorator from paddle.fluid.wrapped_decorator import wrap_decorator
#__all__ = ['Fleet']
def _inited_runtime_handler_(func): def _inited_runtime_handler_(func):
def __impl__(*args, **kwargs): def __impl__(*args, **kwargs):
...@@ -43,65 +41,123 @@ inited_runtime_handler = wrap_decorator(_inited_runtime_handler_) ...@@ -43,65 +41,123 @@ inited_runtime_handler = wrap_decorator(_inited_runtime_handler_)
class Fleet(object): class Fleet(object):
""" """
Unified API for distributed training of PaddlePaddle Unified API for distributed training of PaddlePaddle
Please reference the https://github.com/PaddlePaddle/Fleet for details Please reference the https://github.com/PaddlePaddle/FleetX for details
Returns: Returns:
Fleet: A Fleet instance Fleet: A Fleet instance
Examples: Example for collective training:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# do distributed training
Example for parameter server training:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
if fleet.is_first_worker(): if fleet.is_first_worker():
print("this is first worker") print("this is first worker")
print("current node index: {}".format(fleet.worker_index())) print("current node index: {}".format(fleet.worker_index()))
print("total number of worker num: {}".format(fleet.worker_num())) print("total number of worker num: {}".format(fleet.worker_num()))
if fleet.is_worker(): if fleet.is_worker():
print("this is worker") print("this is worker")
print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True)))
print("server num: {}".format(fleet.server_num())) print("server num: {}".format(fleet.server_num()))
print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) print("server endpoints: {}".format(fleet.server_endpoints(to_string=True)))
if fleet.is_server(): if fleet.is_server():
print("this is server") print("this is server")
fleet.stop_worker() fleet.stop_worker()
""" """
def __init__(self): def __init__(self):
self._runtime_handle = None
self._util = None
self._role_maker = None self._role_maker = None
self.strategy_compiler = None
self._is_collective = False self._is_collective = False
self._runtime_handle = None
self._util = None
def init(self, role_maker=None, is_collective=False): def init(self, role_maker=None, is_collective=False):
""" """
Initialize role_maker in Fleet. Initialize role_maker in Fleet.
This function is responsible for the distributed architecture This function is responsible for the distributed architecture
what you want to run your code behind,such as Transpiler, that you want to run your code on.
Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker
Args:
role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration
of environment variables related to distributed training. If you do not initialize
the role maker yourself, it will be automatically initialized to PaddleCloudRoleMaker.
The default value is None.
is_collective (Boolean, optional): A ``Boolean`` variable that determines whether the program
runs on CPU or GPU. False means distributed training runs on CPU, and True means
GPU. The default value is False.
Returns:
None
Examples1:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
Examples2:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init(is_collective=True)
Examples3:
.. code-block:: python
import paddle.distributed.fleet as fleet
role = fleet.PaddleCloudRoleMaker
fleet.init(role)
""" """
if isinstance(role_maker, RoleMakerBase):
self._role_maker = role_maker if role_maker is None:
elif role_maker == None:
if isinstance(is_collective, bool): if isinstance(is_collective, bool):
self._is_collective = is_collective self._is_collective = is_collective
self._role_maker = PaddleCloudRoleMaker( self._role_maker = PaddleCloudRoleMaker(
is_collective=self._is_collective) is_collective=self._is_collective)
else: else:
raise ValueError( raise ValueError(
"Something wrong occurred, please check whether is_collective is bool value" "`is_collective` should be instance of `bool`, but got {}".
) format(type(is_collective)))
else:
if isinstance(role_maker, RoleMakerBase):
self._role_maker = role_maker
else: else:
raise ValueError( raise ValueError(
"Something wrong occurred, please check whether rolemaker is instance of RoleMakerBase" "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
) format(type(role_maker)))
self.strategy_compiler = StrategyCompiler() self.strategy_compiler = StrategyCompiler()
return None return None
...@@ -113,6 +169,14 @@ class Fleet(object): ...@@ -113,6 +169,14 @@ class Fleet(object):
bool: True if this is the first node of worker, bool: True if this is the first node of worker,
False if not. False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_first_worker()
""" """
return self._role_maker.is_first_worker() return self._role_maker.is_first_worker()
...@@ -122,6 +186,14 @@ class Fleet(object): ...@@ -122,6 +186,14 @@ class Fleet(object):
Returns: Returns:
int: node id int: node id
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_index()
""" """
return self._role_maker.worker_index() return self._role_maker.worker_index()
...@@ -131,6 +203,14 @@ class Fleet(object): ...@@ -131,6 +203,14 @@ class Fleet(object):
Returns: Returns:
int: worker numbers int: worker numbers
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_num()
""" """
return self._role_maker.worker_num() return self._role_maker.worker_num()
...@@ -141,15 +221,31 @@ class Fleet(object): ...@@ -141,15 +221,31 @@ class Fleet(object):
Returns: Returns:
bool: True if this is a node of worker, bool: True if this is a node of worker,
False if not. False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_worker()
""" """
return self._role_maker.is_worker() return self._role_maker.is_worker()
def worker_endpoints(self, to_string=False): def worker_endpoints(self, to_string=False):
""" """
Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"]. Get current worker endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"].
Returns: Returns:
list/string: worker endpoints list/string: worker endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_endpoints()
""" """
''' '''
if to_string: if to_string:
...@@ -165,6 +261,12 @@ class Fleet(object): ...@@ -165,6 +261,12 @@ class Fleet(object):
Returns: Returns:
int: server number int: server number
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_num()
""" """
return len(self._role_maker.get_pserver_endpoints()) return len(self._role_maker.get_pserver_endpoints())
...@@ -174,6 +276,14 @@ class Fleet(object): ...@@ -174,6 +276,14 @@ class Fleet(object):
Returns: Returns:
int: node id int: node id
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_index()
""" """
return self._role_maker.server_index() return self._role_maker.server_index()
...@@ -183,14 +293,20 @@ class Fleet(object): ...@@ -183,14 +293,20 @@ class Fleet(object):
Returns: Returns:
list/string: server endpoints list/string: server endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_endpoints()
""" """
'''
if to_string: if to_string:
return ",".join(self._role_maker.get_pserver_endpoints()) return ",".join(self._role_maker.get_pserver_endpoints())
else: else:
return self._role_maker.get_pserver_endpoints() return self._role_maker.get_pserver_endpoints()
'''
return ["127.0.0.1:1001", "127.0.0.1:1002"]
def is_server(self): def is_server(self):
""" """
...@@ -199,14 +315,36 @@ class Fleet(object): ...@@ -199,14 +315,36 @@ class Fleet(object):
Returns: Returns:
bool: True if this is a node of server, bool: True if this is a node of server,
False if not. False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_server()
""" """
return self._role_maker.is_server() return self._role_maker.is_server(
) or self._role_maker._is_heter_worker()
@property @property
def util(self): def util(self):
""" """
Utility functions that can be used under certain runtime Utility functions that can be used under certain runtime
return util return util
Returns:
UtilBase: instance of UtilBase, can use distributed ops/tools easily.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
util = fleet.util
files = ["1.log", "2.log", "3.log", "4.log"]
files = util.get_file_shard(files)
""" """
return self._util return self._util
...@@ -214,41 +352,114 @@ class Fleet(object): ...@@ -214,41 +352,114 @@ class Fleet(object):
def util(self, util): def util(self, util):
""" """
Set Utility functions for user-defined runtime Set Utility functions for user-defined runtime
set util
Returns:
None
""" """
self._util = util self._util = util
def barrier_worker(self): def barrier_worker(self):
""" """
barrier between workers barrier all workers
Returns:
None
""" """
self._role_maker.barrier_worker() self._role_maker.barrier_worker()
@inited_runtime_handler @inited_runtime_handler
def init_worker(self): def init_worker(self):
""" """
init worker initialize `Communicator` for parameter server training.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_worker()
""" """
self._runtime_handle._init_worker() self._runtime_handle._init_worker()
@inited_runtime_handler @inited_runtime_handler
def init_server(self, *args, **kwargs): def init_server(self, *args, **kwargs):
""" """
init server init_server runs the executor to initialize the startup program;
if the `args` is not empty, it will also run load_persistables for incremental training.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_server()
""" """
self._runtime_handle._init_server(*args, **kwargs) self._runtime_handle._init_server(*args, **kwargs)
@inited_runtime_handler @inited_runtime_handler
def run_server(self): def run_server(self):
""" """
run server run_server runs the pserver main program with the executor.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
if fleet.is_server():
fleet.init_server()
fleet.run_server()
""" """
self._runtime_handle._run_server() self._runtime_handle._run_server()
@inited_runtime_handler @inited_runtime_handler
def stop_worker(self): def stop_worker(self):
""" """
stop worker stop `Communicator` and notify the parameter server that training is complete.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_worker()
fleet.stop_worker()
""" """
self._runtime_handle._stop_worker() self._runtime_handle._stop_worker()
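Taken together, the APIs above make up one parameter-server launch flow. Below is a minimal sketch of that flow, assuming the network-building and optimizer code is supplied by the user; it only combines the calls documented above and is not part of the patch:

    import paddle.distributed.fleet as fleet

    fleet.init()
    # ... build the network, obtain avg_cost ...
    # optimizer = fleet.distributed_optimizer(optimizer, strategy)
    # optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()
        # ... run the training loop with an executor ...
        fleet.stop_worker()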
...@@ -259,27 +470,98 @@ class Fleet(object): ...@@ -259,27 +470,98 @@ class Fleet(object):
target_vars, target_vars,
main_program=None, main_program=None,
export_for_deployment=True): export_for_deployment=True):
"""
save inference model for deployment.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
# build net
# fleet.distributed_optimizer(...)
fleet.init_server()
"""
self._runtime_handle._save_inference_model( self._runtime_handle._save_inference_model(
executor, dirname, feeded_var_names, target_vars, main_program, executor, dirname, feeded_var_names, target_vars, main_program,
export_for_deployment) export_for_deployment)
def save_persistables(self, executor, dirname, main_program=None): def save_persistables(self, executor, dirname, main_program=None):
"""
saves all persistable variables from :code:`main_program` to
the folder :code:`dirname`. You can refer to
The :code:`dirname` is used to specify the folder where persistable variables
are going to be saved. If you would like to save variables in separate
files, set :code:`filename` None.
Args:
executor(Executor): The executor to run for saving persistable variables.
You can refer to :ref:`api_guide_executor_en` for
more details.
dirname(str, optional): The saving directory path.
When you need to save the parameters to memory, set it to None.
main_program(Program, optional): The program whose persistable variables will
be saved. Default: None.
Returns:
None
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
fleet.init()
# build net
# fleet.distributed_optimizer(...)
exe = fluid.Executor(fluid.CPUPlace())
fleet.save_persistables(exe, "dirname", fluid.default_main_program())
"""
self._runtime_handle._save_persistables(executor, dirname, main_program) self._runtime_handle._save_persistables(executor, dirname, main_program)
def distributed_optimizer(self, optimizer, strategy=None): def distributed_optimizer(self, optimizer, strategy=None):
""" """
distirbuted_optimizer Optimizer for distributed training.
For distributed training, this method would rebuild a new instance of DistributedOptimizer,
which has the basic Optimizer functionality plus special features for distributed training.
Args:
optimizer(Optimizer): The optimizer to be used in distributed training.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
Returns: Returns:
Fleet instance with minimize interface like optimizers Fleet: instance of fleet.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
""" """
self.user_defined_optimizer = optimizer self.user_defined_optimizer = optimizer
if strategy == None: if strategy == None:
...@@ -316,14 +598,16 @@ class Fleet(object): ...@@ -316,14 +598,16 @@ class Fleet(object):
``fetch_list`` before run, see details in ``Executor``. ``fetch_list`` before run, see details in ``Executor``.
Examples: Examples:
.. code-block:: python
import paddle import paddle
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh') fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh')
fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh') fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh')
prediction = paddle.layers.fc(input=[fc_2], size=label_dim, act='softmax') prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax')
cost = paddle.layers.cross_entropy(input=prediction, label=input_y) cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
avg_cost = paddle.layers.mean(x=cost) avg_cost = paddle.fluid.layers.mean(x=cost)
role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
...@@ -332,7 +616,7 @@ class Fleet(object): ...@@ -332,7 +616,7 @@ class Fleet(object):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
# for more examples, please reference https://github.com/PaddlePaddle/Fleet # for more examples, please reference https://github.com/PaddlePaddle/FleetX
""" """
context = {} context = {}
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""Defination of Role Makers.""" """Defination of Role Makers."""
import os import os
import numpy as np import numpy as np
import warnings
from multiprocessing import Process, Manager from multiprocessing import Process, Manager
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -23,6 +24,7 @@ import paddle.fluid as fluid ...@@ -23,6 +24,7 @@ import paddle.fluid as fluid
class Role: class Role:
WORKER = 1 WORKER = 1
SERVER = 2 SERVER = 2
HETER_WORKER = 3
class RoleMakerBase(object): class RoleMakerBase(object):
...@@ -40,6 +42,11 @@ class RoleMakerBase(object): ...@@ -40,6 +42,11 @@ class RoleMakerBase(object):
self._role = None self._role = None
self._current_id = -1 self._current_id = -1
# for heter parameter server mode
self._heter_trainer_endpoints = []
self._heter_trainer_device = "CPU"
self._is_heter_parameter_server_mode = False
self._node_type = None self._node_type = None
self._node_type_comm = None self._node_type_comm = None
self._all_comm = None self._all_comm = None
...@@ -163,12 +170,58 @@ class RoleMakerBase(object): ...@@ -163,12 +170,58 @@ class RoleMakerBase(object):
""" """
print("warning: RoleMakerBase does not have barrier worker.") print("warning: RoleMakerBase does not have barrier worker.")
def _is_heter_worker(self):
"""
Return is_heter_worker() of current process
"""
warnings.warn("RoleMakerBase does not have function: _is_heter_worker.")
return False
def _heter_worker_num(self):
"""
Get current total heter-worker number.
Returns:
int: heter_worker number
"""
warnings.warn(
"RoleMakerBase does not have function: _heter_worker_num.")
return 0
def _get_heter_worker_endpoints(self):
"""
Returns:
string: all heter_trainers' endpoints
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self):
"""
Returns:
string: the corresponding heter_trainer's endpoint
e.g.: if we have 4 cpu-trainers (default) and 2 gpu-trainers (heter),
then No.0 and No.2 cpu-trainers will work with No.0 gpu-trainer
and No.1 and No.3 cpu-trainers will work with No.1 gpu-trainer
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints[(self._current_id + 1) %
self._heter_worker_num()]
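# Editor's illustration (not part of the patch): the pairing implemented above is
# heter_endpoints[(current_id + 1) % heter_worker_num]. With 4 cpu-trainers and
# two hypothetical heter endpoints, the mapping works out as below; note the +1
# offset compared to the plain modulo pairing described in the docstring example.
heter_eps = ["10.0.0.5:7000", "10.0.0.6:7000"]  # hypothetical endpoints
for trainer_id in range(4):
    print(trainer_id, heter_eps[(trainer_id + 1) % len(heter_eps)])
# 0 -> 10.0.0.6:7000, 1 -> 10.0.0.5:7000, 2 -> 10.0.0.6:7000, 3 -> 10.0.0.5:7000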
def _get_heter_worker_device(self):
"""
Returns:
string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
"""
return self._heter_trainer_device.upper()
class PaddleCloudRoleMaker(RoleMakerBase): class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, **kwargs): def __init__(self, is_collective=False, **kwargs):
super(PaddleCloudRoleMaker, self).__init__() super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective self._is_collective = is_collective
self._init_gloo = False #default no init gloo self._init_gloo = False # default no init gloo
self._kwargs = kwargs self._kwargs = kwargs
self._role_is_generated = False self._role_is_generated = False
...@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -278,10 +331,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
""" """
get index of current node get index of current node
""" """
if self.is_server(): return self._current_id
return self.server_index()
elif self.is_worker():
return self.worker_index()
def worker_num(self): def worker_num(self):
""" """
...@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -323,6 +373,22 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self.generate_role() self.generate_role()
return self._server_endpoints return self._server_endpoints
def _heter_worker_num(self):
"""
get the number of heter workers
"""
if not self._role_is_generated:
self.generate_role()
return self._heter_trainers_num
def _is_heter_worker(self):
"""
whether current process is heter worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.HETER_WORKER
def _get_rank(self): def _get_rank(self):
""" """
get current rank in all workers and pservers get current rank in all workers and pservers
...@@ -342,17 +408,47 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -342,17 +408,47 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _ps_env(self): def _ps_env(self):
try: try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001 # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
self._server_endpoints = os.environ[ self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST",
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",") "").split(",")
assert self._server_endpoints != ""
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",") "").split(",")
assert self._server_endpoints != ""
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"] training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]: if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER") raise ValueError(
"TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
format(training_role))
# For heter parameter server env setting
heter_trainer_eplist = os.getenv(
"PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
None)
if heter_trainer_eplist and heter_trainer_device:
try:
heter_trainer_eplist = os.environ[
"PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
except:
raise ValueError(
"Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
)
self._is_heter_parameter_server_mode = True
heter_trainers_num = len(heter_trainer_eplist)
current_node_device = heter_trainer_device.upper()
if current_node_device not in ["CPU", "GPU", "XPU"]:
raise ValueError(
"Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
format(heter_trainer_device))
self._heter_trainer_device = current_node_device
else:
self._is_heter_parameter_server_mode = False
heter_trainers_num = 0
if training_role == "TRAINER": if training_role == "TRAINER":
role = Role.WORKER role = Role.WORKER
...@@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -365,17 +461,26 @@ class PaddleCloudRoleMaker(RoleMakerBase):
ip = os.environ["POD_IP"] ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint) current_id = self._server_endpoints.index(self._cur_endpoint)
elif training_role == "HETER_TRAINER":
role = Role.HETER_WORKER
cur_ip = os.environ["POD_IP"]
cur_port = os.environ["PADDLE_PORT"]
curr_endpoint = ":".join([cur_ip, cur_port])
current_id = heter_trainer_eplist.index(curr_endpoint)
else: else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as ve:
raise ValueError( raise ValueError(
"something wrong with PaddleCloud, please check environment") "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
except ValueError as e:
raise ValueError(
"Something wrong with PaddleCloud, please check environment")
self._trainers_num = trainers_num self._trainers_num = trainers_num
self._role = role self._role = role
self._current_id = current_id self._current_id = current_id
self._node_num = len( self._node_num = len(
set([x.split(':')[0] for x in self._worker_endpoints])) set([x.split(':')[0] for x in self._worker_endpoints]))
self._heter_trainers_num = heter_trainers_num
self._heter_trainer_endpoints = heter_trainer_eplist
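# Editor's illustration (not part of the patch): the environment this branch
# expects for heter parameter-server mode; the addresses and ports below are made up.
#   export TRAINING_ROLE=HETER_TRAINER            # or TRAINER / PSERVER
#   export PADDLE_PSERVERS_IP_PORT_LIST=10.0.0.1:6000
#   export PADDLE_TRAINER_ENDPOINTS=10.0.0.2:6100,10.0.0.3:6100
#   export PADDLE_TRAINERS_NUM=2
#   export PADDLE_HETER_TRAINER_IP_PORT_LIST=10.0.0.4:6200,10.0.0.5:6200
#   export PADDLE_HETER_TRAINER_DEVICE=GPU        # CPU / GPU / XPU
#   export POD_IP=10.0.0.4
#   export PADDLE_PORT=6200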
def _collective_env(self): def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
......
...@@ -15,10 +15,10 @@ from .amp_optimizer import AMPOptimizer ...@@ -15,10 +15,10 @@ from .amp_optimizer import AMPOptimizer
from .recompute_optimizer import RecomputeOptimizer from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer from .graph_execution_optimizer import GraphExecutionOptimizer
from .async_optimizer import AsyncMetaOptimizer from .parameter_server_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer from .localsgd_optimizer import LocalSGDOptimizer
from .lars_optimizer import LarsOptimizer from .lars_optimizer import LarsOptimizer
from .async_graph_execution_optimizer import AsyncGraphExecutionOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer from .lamb_optimizer import LambOptimizer
...@@ -13,12 +13,12 @@ ...@@ -13,12 +13,12 @@
from paddle import fluid from paddle import fluid
from paddle.fluid import compiler from paddle.fluid import compiler
from .async_optimizer import AsyncMetaOptimizer from .parameter_server_optimizer import ParameterServerOptimizer
class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer): class ParameterServerGraphOptimizer(ParameterServerOptimizer):
def __init__(self, optimizer): def __init__(self, optimizer):
super(AsyncGraphExecutionOptimizer, self).__init__(optimizer) super(ParameterServerGraphOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently # we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = [] self.meta_optimizers_white_list = []
...@@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer): ...@@ -31,6 +31,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
if self.role_maker.is_server(): if self.role_maker.is_server():
return False return False
if self.role_maker._is_heter_parameter_server_mode:
return False
return True return True
def _disable_strategy(self, dist_strategy): def _disable_strategy(self, dist_strategy):
......
...@@ -15,9 +15,9 @@ from paddle import fluid ...@@ -15,9 +15,9 @@ from paddle import fluid
from .meta_optimizer_base import MetaOptimizerBase from .meta_optimizer_base import MetaOptimizerBase
class AsyncMetaOptimizer(MetaOptimizerBase): class ParameterServerOptimizer(MetaOptimizerBase):
def __init__(self, optimizer): def __init__(self, optimizer):
super(AsyncMetaOptimizer, self).__init__(optimizer) super(ParameterServerOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently # we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = [] self.meta_optimizers_white_list = []
...@@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase): ...@@ -68,6 +68,21 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_startup = worker.init_from_server_pass(_startup, compiled_config) _startup = worker.init_from_server_pass(_startup, compiled_config)
_startup = worker.delet_extra_optimizes_pass(_startup, _startup = worker.delet_extra_optimizes_pass(_startup,
compiled_config) compiled_config)
# for heter program
if self.role_maker._is_heter_parameter_server_mode:
from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
if self.role_maker._is_heter_worker():
# for heter worker
_main = heter_worker.split_heter_worker_ops_pass(
_main, compiled_config)
else:
# for default worker
_main = heter_worker.split_trainer_ops_pass(_main,
compiled_config)
# for startup change
_startup = heter_worker.delete_startup_useless_ops_var_pass(
_startup, _main, compiled_config)
else: else:
_main = worker.append_send_ops_pass(_main, compiled_config) _main = worker.append_send_ops_pass(_main, compiled_config)
_startup = _startup _startup = _startup
...@@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase): ...@@ -129,9 +144,12 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
_origin_startup_program, _origin_startup_program,
strategy, self.role_maker) strategy, self.role_maker)
main_program, startup_program = \ if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
self._build_trainer_programs(compiled_config) if self.role_maker.is_worker() \ main_program, startup_program = self._build_trainer_programs(
else self._build_pserver_programs(compiled_config) compiled_config)
elif self.role_maker.is_server():
main_program, startup_program = self._build_pserver_programs(
compiled_config)
loss.block.program = main_program loss.block.program = main_program
fluid.framework.switch_startup_program(startup_program) fluid.framework.switch_startup_program(startup_program)
......
...@@ -154,15 +154,16 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -154,15 +154,16 @@ class ParameterServerRuntime(RuntimeBase):
kwargs["sparse_attrs"] = get_sparse_attrs() kwargs["sparse_attrs"] = get_sparse_attrs()
return kwargs return kwargs
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
SyncStrategy, GeoStrategy SyncStrategy, GeoStrategy
trainer_config = self.async_strategy.get_trainer_runtime_config() trainer_config = self.async_strategy.get_trainer_runtime_config()
lrs = _get_lr_ops(self.origin_main_program)
if len(lrs) > 0: lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
if lrs:
kwargs = {"need_global_step": "1"} kwargs = {"need_global_step": "1"}
else: else:
kwargs = {"need_global_step": "0"} kwargs = {"need_global_step": "0"}
...@@ -196,6 +197,18 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -196,6 +197,18 @@ class ParameterServerRuntime(RuntimeBase):
else: else:
warnings.warn("communicator has been initialized, skip") warnings.warn("communicator has been initialized, skip")
def _get_executor(self):
if self.role_maker._is_heter_worker():
if self.role_maker._get_heter_worker_device() == "GPU":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
executor = Executor(fluid.CUDAPlace(gpu_id))
else:
raise ValueError("Not Support Device {}".format(
self.role_maker._get_heter_worker_device()))
else:
executor = fluid.Executor(fluid.CPUPlace())
return executor
def _init_server(self, *args, **kwargs): def _init_server(self, *args, **kwargs):
if len(args) > 1: if len(args) > 1:
raise ValueError("init server can only accept 1 args: `dirname`") raise ValueError("init server can only accept 1 args: `dirname`")
...@@ -204,9 +217,15 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -204,9 +217,15 @@ class ParameterServerRuntime(RuntimeBase):
else: else:
model_dirname = None model_dirname = None
executor = fluid.Executor(fluid.CPUPlace()) if self.role_maker._is_heter_worker():
self._init_worker()
executor = self._get_executor()
executor.run(fluid.default_startup_program()) executor.run(fluid.default_startup_program())
if self.role_maker._is_heter_worker():
return
if not model_dirname: if not model_dirname:
return return
...@@ -237,12 +256,12 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -237,12 +256,12 @@ class ParameterServerRuntime(RuntimeBase):
# self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames) # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
def _run_server(self): def _run_server(self):
executor = fluid.Executor(fluid.CPUPlace()) executor = self._get_executor()
executor.run(fluid.default_main_program()) executor.run(fluid.default_main_program())
def _stop_worker(self): def _stop_worker(self):
self._communicator.stop() self._communicator.stop()
executor = fluid.Executor(fluid.CPUPlace()) executor = self._get_executor()
executor.close() executor.close()
def _get_optimizer_status(self, op, param_name): def _get_optimizer_status(self, op, param_name):
......
...@@ -29,13 +29,13 @@ __all__ = ["init_parallel_env"] ...@@ -29,13 +29,13 @@ __all__ = ["init_parallel_env"]
ParallelStrategy = core.ParallelStrategy ParallelStrategy = core.ParallelStrategy
def init_parallel_env(backend='nccl'): def init_parallel_env():
""" """
Initialize parallel training environments in dynamic mode. Initialize parallel training environment in dynamic graph mode.
Args: .. note::
backend(str, optional): The backend to communication between multiple devices. Now only supports initializing the GPU parallel training
Now only support ``nccl`` . Default value is ``nccl`` . environment and using NCCL for communication.
Returns: Returns:
None None
...@@ -89,14 +89,12 @@ def init_parallel_env(backend='nccl'): ...@@ -89,14 +89,12 @@ def init_parallel_env(backend='nccl'):
dist.spawn(train) dist.spawn(train)
""" """
# 1. input check # 1. gpu check
if not isinstance(backend, six.string_types): if not core.is_compiled_with_cuda():
raise TypeError("input `backend` type error, expected type is str, " raise NotImplementedError(
"but received type is %s." % type(backend)) "Cannot initialize parallel environment in CPU-only version, now only "
if cpt.to_text(backend) != 'nccl': "supports initializing the GPU parallel environment. Please recompile "
raise ValueError( "or reinstall paddle with GPU support.")
"backend `%s` is not supported, now only supports `nccl` backend." %
backend)
# 2. check env # 2. check env
def _check_var_exists(var_name): def _check_var_exists(var_name):
...@@ -112,9 +110,8 @@ def init_parallel_env(backend='nccl'): ...@@ -112,9 +110,8 @@ def init_parallel_env(backend='nccl'):
_check_var_exists("PADDLE_TRAINERS_NUM") _check_var_exists("PADDLE_TRAINERS_NUM")
_check_var_exists("PADDLE_TRAINER_ENDPOINTS") _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
# 3. init ParallelStrategy # 3. init NCCL ParallelStrategy
strategy = ParallelStrategy() strategy = ParallelStrategy()
if cpt.to_text(backend) == 'nccl':
if parallel_helper._is_parallel_ctx_initialized(): if parallel_helper._is_parallel_ctx_initialized():
warnings.warn("The parallel environment has been initialized.") warnings.warn("The parallel environment has been initialized.")
strategy.nranks = ParallelEnv().world_size strategy.nranks = ParallelEnv().world_size
...@@ -133,8 +130,7 @@ def init_parallel_env(backend='nccl'): ...@@ -133,8 +130,7 @@ def init_parallel_env(backend='nccl'):
_set_expected_place(place) _set_expected_place(place)
# init nccl context # init nccl context
parallel_helper._set_parallel_ctx( parallel_helper._set_parallel_ctx(core.NCCLParallelContext(strategy, place))
core.NCCLParallelContext(strategy, place))
parallel_helper._init_parallel_ctx() parallel_helper._init_parallel_ctx()
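A minimal usage sketch of the argument-free form above, assuming two visible GPUs and launch via paddle.distributed.spawn (model code omitted; not part of the patch):

    import paddle.distributed as dist

    def train():
        dist.init_parallel_env()  # reads the PADDLE_* variables set by the launcher
        # build the model, wrap it with DataParallel, run the training loop

    if __name__ == '__main__':
        dist.spawn(train, nprocs=2)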
...@@ -163,7 +159,7 @@ def get_rank(): ...@@ -163,7 +159,7 @@ def get_rank():
def get_world_size(): def get_world_size():
""" """
The number of trainers (number of processes participating in current job). Returns the number of trainers (number of processes participating in current job).
Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` .
The default value is 1. The default value is 1.
......
...@@ -236,8 +236,6 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): ...@@ -236,8 +236,6 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
func (function): The target function is called by spawned process. func (function): The target function is called by spawned process.
This function need to be able to pickled, so it must be defined This function need to be able to pickled, so it must be defined
at the top level of a module. at the top level of a module.
This function should be called as ``func(i, *args)``, ``i`` is
the process index and ``args`` contains other arguments as tuple.
args (tuple, optional): Arguments passed to ``func``. args (tuple, optional): Arguments passed to ``func``.
nprocs (int, optional): Number of processes to start. Default: -1. nprocs (int, optional): Number of processes to start. Default: -1.
when nprocs is -1, the available device will be obtained from when nprocs is -1, the available device will be obtained from
...@@ -246,8 +244,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): ...@@ -246,8 +244,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available
CPU number is obtained from the environment variable CPU_NUM. CPU number is obtained from the environment variable CPU_NUM.
For example, export CPU_NUM=4, if the environment variable is not set, For example, export CPU_NUM=4, if the environment variable is not set,
the executor will add the variable to the environment variable and the spawn method will add default value to the environment variable
set its value to 1. and set its value to 1.
join (bool, optional): Perform a blocking join on all spawned processes. join (bool, optional): Perform a blocking join on all spawned processes.
Default: True. Default: True.
daemon (bool, optional): The spawned processes' daemon flag. Default: False. daemon (bool, optional): The spawned processes' daemon flag. Default: False.
...@@ -266,8 +264,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): ...@@ -266,8 +264,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
such as 6170. Default: None; such as 6170. Default: None;
(5) selected_gpus (string): The training process will run on the (5) selected_gpus (string): The training process will run on the
selected_gpus, such as "0,1,2,3". Default: None; selected_gpus, such as "0,1,2,3". Default: None;
(6) print_config: Print current parallel training config. Default: False; (6) print_config (bool): Print current parallel training config. Default: False;
(7) use_paddlecloud: Whether to use paddlecloud platform to run your (7) use_paddlecloud (bool): Whether to use paddlecloud platform to run your
multi-process job. Default: False. multi-process job. Default: False.
Returns: Returns:
......
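A hedged sketch of passing the ``spawn`` options documented above; the GPU ids are placeholder values and the training body is omitted (not part of the patch):

    import paddle.distributed as dist

    def train(print_result=False):
        pass  # build and train the model here

    if __name__ == '__main__':
        dist.spawn(train,
                   args=(True,),
                   nprocs=2,
                   join=True,
                   selected_gpus='0,1',     # option (5) above
                   print_config=True,       # option (6) above
                   use_paddlecloud=False)   # option (7) above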
...@@ -129,7 +129,7 @@ class GradientClipBase(object): ...@@ -129,7 +129,7 @@ class GradientClipBase(object):
def __str__(self): def __str__(self):
raise NotImplementedError() raise NotImplementedError()
@imperative_base.no_grad() @imperative_base.no_grad
def _dygraph_clip(self, params_grads): def _dygraph_clip(self, params_grads):
raise NotImplementedError raise NotImplementedError
...@@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase): ...@@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase):
def __str__(self): def __str__(self):
return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max)
@imperative_base.no_grad() @imperative_base.no_grad
def _dygraph_clip(self, params_grads): def _dygraph_clip(self, params_grads):
params_and_grads = [] params_and_grads = []
for p, g in params_grads: for p, g in params_grads:
...@@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase): ...@@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase):
def __str__(self): def __str__(self):
return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm
@imperative_base.no_grad() @imperative_base.no_grad
def _dygraph_clip(self, params_grads): def _dygraph_clip(self, params_grads):
params_and_grads = [] params_and_grads = []
for p, g in params_grads: for p, g in params_grads:
...@@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase): ...@@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
def __str__(self): def __str__(self):
return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)
@imperative_base.no_grad() @imperative_base.no_grad
def _dygraph_clip(self, params_grads): def _dygraph_clip(self, params_grads):
params_and_grads = [] params_and_grads = []
sum_square_list = [] sum_square_list = []
......
...@@ -69,7 +69,7 @@ class ImperativeQuantAware(object): ...@@ -69,7 +69,7 @@ class ImperativeQuantAware(object):
from paddle.fluid.contrib.slim.quantization \ from paddle.fluid.contrib.slim.quantization \
import ImperativeQuantAware import ImperativeQuantAware
from paddle.incubate.hapi.vision.models \ from paddle.vision.models \
import resnet import resnet
model = resnet.resnet50(pretrained=True) model = resnet.resnet50(pretrained=True)
......
...@@ -16,10 +16,12 @@ from __future__ import print_function ...@@ -16,10 +16,12 @@ from __future__ import print_function
from __future__ import division from __future__ import division
import numpy as np import numpy as np
import math
from .sampler import Sampler, SequenceSampler, RandomSampler from .sampler import Sampler, SequenceSampler, RandomSampler
from .dataset import Dataset, IterableDataset from .dataset import Dataset, IterableDataset
__all__ = ["BatchSampler"] __all__ = ["BatchSampler", "DistributedBatchSampler"]
class BatchSampler(Sampler): class BatchSampler(Sampler):
...@@ -158,3 +160,185 @@ class _InfiniteIterableSampler(object): ...@@ -158,3 +160,185 @@ class _InfiniteIterableSampler(object):
def __iter__(self): def __iter__(self):
while True: while True:
yield [None] * self.batch_size yield [None] * self.batch_size
class DistributedBatchSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such a case, each process can pass a DistributedBatchSampler instance
as a DataLoader sampler, and load a subset of the original dataset that
is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Args:
dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implementation
or another python object which implements
`__len__`, for BatchSampler to get the sample
number of the data source.
batch_size(int): the number of sample indices in a mini-batch.
num_replicas(int, optional): process number in distributed training.
If :attr:`num_replicas` is None, :attr:`num_replicas` will be
retrieved from :code:`paddle.fluid.dygraph.parallel.ParallelEnv`.
Default None.
rank(int, optional): the rank of the current process among :attr:`num_replicas`
processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
:code:`paddle.fluid.dygraph.parallel.ParallelEnv`. Default None.
shuffle(bool): whether to shuffle indices order before generating
batch indices. Default False.
drop_last(bool): whether to drop the last incomplete batch when the dataset size
is not divisible by the batch size. Default False
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for data in sampler:
# do something
break
"""
def __init__(self,
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=False,
drop_last=False):
self.dataset = dataset
assert isinstance(batch_size, int) and batch_size > 0, \
"batch_size should be a positive integer"
self.batch_size = batch_size
assert isinstance(shuffle, bool), \
"shuffle should be a boolean value"
self.shuffle = shuffle
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
from paddle.fluid.dygraph.parallel import ParallelEnv
if num_replicas is not None:
assert isinstance(num_replicas, int) and num_replicas > 0, \
"num_replicas should be a positive integer"
self.nranks = num_replicas
else:
self.nranks = ParallelEnv().nranks
if rank is not None:
assert isinstance(rank, int) and rank >= 0, \
"rank should be a non-negative integer"
self.local_rank = rank
else:
self.local_rank = ParallelEnv().local_rank
self.drop_last = drop_last
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
self.total_size = self.num_samples * self.nranks
def __iter__(self):
num_samples = len(self.dataset)
indices = np.arange(num_samples).tolist()
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
if self.shuffle:
np.random.RandomState(self.epoch).shuffle(indices)
self.epoch += 1
# subsample
def _get_indices_by_batch_size(indices):
subsampled_indices = []
last_batch_size = self.total_size % (self.batch_size * self.nranks)
assert last_batch_size % self.nranks == 0
last_local_batch_size = last_batch_size // self.nranks
for i in range(self.local_rank * self.batch_size,
len(indices) - last_batch_size,
self.batch_size * self.nranks):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
subsampled_indices.extend(indices[
self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices
if self.nranks > 1:
indices = _get_indices_by_batch_size(indices)
assert len(indices) == self.num_samples
_sample_iter = iter(indices)
batch_indices = []
for idx in _sample_iter:
batch_indices.append(idx)
if len(batch_indices) == self.batch_size:
yield batch_indices
batch_indices = []
if not self.drop_last and len(batch_indices) > 0:
yield batch_indices
def __len__(self):
num_samples = self.num_samples
num_samples += int(not self.drop_last) * (self.batch_size - 1)
return num_samples // self.batch_size
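# Editor's illustration (not part of the patch): how the padding and
# _get_indices_by_batch_size logic above shard indices across ranks.
# The numbers (10 samples, 2 ranks, batch_size 4) are made up for the example.
import math

num_samples, nranks, batch_size = 10, 2, 4
per_rank = int(math.ceil(num_samples * 1.0 / nranks))    # 5
total_size = per_rank * nranks                           # 10
indices = list(range(num_samples))
indices += indices[:total_size - len(indices)]           # pad (a no-op here)

def shard(rank):
    out = []
    last = total_size % (batch_size * nranks)             # 2 leftover indices
    last_local = last // nranks                           # 1 per rank
    for i in range(rank * batch_size, len(indices) - last, batch_size * nranks):
        out.extend(indices[i:i + batch_size])
    tail = indices[len(indices) - last:]
    out.extend(tail[rank * last_local:(rank + 1) * last_local])
    return out

print(shard(0))  # [0, 1, 2, 3, 8]
print(shard(1))  # [4, 5, 6, 7, 9]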
def set_epoch(self, epoch):
"""
Sets the epoch number. When :attr:`shuffle=True`, this number is used
as the seed of the random ordering. By default, users do not need to set
this; every epoch then uses a different random ordering. If the same
number is set at each epoch, this sampler will yield the same
ordering for all epochs.
Arguments:
epoch (int): Epoch number.
Examples:
.. code-block:: python
import numpy as np
from paddle.io import Dataset, DistributedBatchSampler
# init with dataset
class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
dataset = RandomDataset(100)
sampler = DistributedBatchSampler(dataset, batch_size=64)
for epoch in range(10):
sampler.set_epoch(epoch)
"""
self.epoch = epoch
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
import inspect
import decorator import decorator
import contextlib import contextlib
import functools
import inspect
import sys import sys
import numpy as np import numpy as np
from paddle.fluid import core from paddle.fluid import core
...@@ -26,8 +27,8 @@ import objgraph ...@@ -26,8 +27,8 @@ import objgraph
from ..data_feeder import convert_dtype from ..data_feeder import convert_dtype
__all__ = [ __all__ = [
'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled', 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
'to_variable' 'enabled', 'to_variable'
] ]
...@@ -167,7 +168,80 @@ def disable_dygraph(): ...@@ -167,7 +168,80 @@ def disable_dygraph():
_functional_dygraph_context_manager = None _functional_dygraph_context_manager = None
class no_grad: @signature_safe_contextmanager
def _switch_tracer_mode_guard_(is_train=True):
tracer = framework._dygraph_tracer()
if tracer:
mode = tracer._train_mode
tracer._train_mode = is_train
try:
yield
finally:
tracer._train_mode = mode
else:
yield
def no_grad(func=None):
"""
:api_attr: imperative
Create a context which disables dygraph gradient calculation.
In this mode, the result of every computation will have `stop_gradient=True`.
Also functions as a decorator. (Make sure to use it without parentheses.)
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
# use as context manager
data = np.array([[2, 3], [4, 5]]).astype('float32')
with fluid.dygraph.guard():
l0 = fluid.Linear(2, 2) # l0.weight.gradient() is None
l1 = fluid.Linear(2, 2)
with fluid.dygraph.no_grad():
# l1.weight.stop_gradient is False
tmp = l1.weight * 2 # tmp.stop_gradient is True
x = fluid.dygraph.to_variable(data)
y = l0(x) + tmp
o = l1(y)
o.backward()
print(tmp.gradient() is None) # True
print(l0.weight.gradient() is None) # False
# use as decorator
@fluid.dygraph.no_grad
def test_layer():
with fluid.dygraph.guard():
inp = np.ones([3, 1024], dtype='float32')
t = fluid.dygraph.base.to_variable(inp)
linear1 = fluid.Linear(1024, 4, bias_attr=False)
linear2 = fluid.Linear(4, 4)
ret = linear1(t)
dy_ret = linear2(ret)
test_layer()
"""
if func is None:
return _switch_tracer_mode_guard_(is_train=False)
else:
@decorator.decorator
def __impl__(func, *args, **kwargs):
with _switch_tracer_mode_guard_(is_train=False):
return func(*args, **kwargs)
return __impl__(func)
class no_grad_:
""" """
:api_attr: imperative :api_attr: imperative
......
...@@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False): ...@@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False):
# NOTE: `jit.save` doesn't save optimizer state # NOTE: `jit.save` doesn't save optimizer state
else: else:
# Load state dict by `save_dygraph` save format # Load state dict by `save_dygraph` save format
para_dict = {}
if os.path.exists(params_file_path): if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f: with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load( para_dict = pickle.load(f) if six.PY2 else pickle.load(
......
...@@ -16,9 +16,7 @@ import astor ...@@ -16,9 +16,7 @@ import astor
import gast import gast
from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper
from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api, is_to_variable from paddle.fluid.dygraph.dygraph_to_static import utils
from paddle.fluid.dygraph.dygraph_to_static.utils import to_assign_node, to_static_ast, update_args_of_func
from paddle.fluid.dygraph.dygraph_to_static.utils import dygraph_class_to_static_api
class BasicApiTransformer(gast.NodeTransformer): class BasicApiTransformer(gast.NodeTransformer):
...@@ -56,7 +54,7 @@ class BasicApiTransformer(gast.NodeTransformer): ...@@ -56,7 +54,7 @@ class BasicApiTransformer(gast.NodeTransformer):
if isinstance(child_node, gast.Call): if isinstance(child_node, gast.Call):
# TODO(liym27): # TODO(liym27):
# Considers that a dygraph api which modifies the input or has a output. # Considers that a dygraph api which modifies the input or has a output.
if is_dygraph_api(child_node): if utils.is_dygraph_api(child_node):
return return
else: else:
self._visit_Call(child_node) self._visit_Call(child_node)
...@@ -73,7 +71,7 @@ class BasicApiTransformer(gast.NodeTransformer): ...@@ -73,7 +71,7 @@ class BasicApiTransformer(gast.NodeTransformer):
if self._is_dygraph_forward(func_name): if self._is_dygraph_forward(func_name):
class_node = self._get_class_node(func_name) class_node = self._get_class_node(func_name)
static_node = to_static_ast(node, class_node) static_node = utils.to_static_ast(node, class_node)
return static_node return static_node
else: else:
return node return node
...@@ -91,14 +89,51 @@ class BasicApiTransformer(gast.NodeTransformer): ...@@ -91,14 +89,51 @@ class BasicApiTransformer(gast.NodeTransformer):
if is_to_variable(node_value): if is_to_variable(node_value):
return False return False
if is_dygraph_api(node_value): if utils.is_dygraph_api(node_value):
dygraph_api = node_value.func.attr dygraph_api = node_value.func.attr
if not dygraph_class_to_static_api.get(dygraph_api): if not utils.dygraph_class_to_static_api.get(dygraph_api):
return False return False
update_args_of_func(node_value, node_value, "__init__") utils.update_args_of_func(node_value, node_value, "__init__")
target_str = astor.to_source(gast.gast_to_ast(node.targets[0])) target_str = astor.to_source(gast.gast_to_ast(node.targets[0]))
self.class_node_dict[target_str] = node_value self.class_node_dict[target_str] = node_value
return True return True
# TODO: node.value is not dygraph class # TODO: node.value is not dygraph class
return False return False
def is_to_variable(node):
assert isinstance(node, gast.Call)
api_name = utils.ast_to_source_code(node.func).strip()
if utils.is_dygraph_api(node):
return api_name.endswith("to_variable")
if utils.is_paddle_api(node):
return api_name.endswith("to_tensor")
return False
def to_assign_node(node):
# Transform dygraph api `fluid.dygraph.to_variable` and its alias `paddle.to_tensor` to static api `fluid.layers.assign`.
# NOTE:
# 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16},
# but api `assign` only supports {float32, float64, int32, int64, bool};
# 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024.
assert isinstance(node, gast.Call)
assign_api = gast.parse('fluid.layers.assign').body[0].value
node.func = assign_api
if node.args:
node.args = [node.args[0]]
node.keywords = []
else:
for idx, kw in enumerate(node.keywords):
if kw.arg == 'value' or kw.arg == 'data':
node.keywords[idx].arg = 'input'
node.keywords = [node.keywords[idx]]
node.args = []
break
return node
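# Editor's illustration (not part of the patch): applying to_assign_node to a
# parsed `paddle.to_tensor` call rewrites it into a `fluid.layers.assign` call.
# `astor` and `gast` are already imported at the top of this module.
_example_tree = gast.parse("x = paddle.to_tensor(data)")
to_assign_node(_example_tree.body[0].value)
print(astor.to_source(gast.gast_to_ast(_example_tree)))  # x = fluid.layers.assign(data)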
...@@ -296,7 +296,7 @@ def convert_to_input_spec(inputs, input_spec): ...@@ -296,7 +296,7 @@ def convert_to_input_spec(inputs, input_spec):
elif isinstance(input_spec, dict): elif isinstance(input_spec, dict):
input_with_spec = {} input_with_spec = {}
check_type_and_len(inputs, input_spec, True) check_type_and_len(inputs, input_spec, True)
for name, input in inputs.items(): for name, input in six.iteritems(inputs):
if name in input_spec: if name in input_spec:
input_with_spec[name] = convert_to_input_spec(input, input_with_spec[name] = convert_to_input_spec(input,
input_spec[name]) input_spec[name])
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from __future__ import print_function from __future__ import print_function
import six
import copy import copy
from collections import defaultdict from collections import defaultdict
...@@ -230,7 +231,7 @@ class NameVisitor(gast.NodeVisitor): ...@@ -230,7 +231,7 @@ class NameVisitor(gast.NodeVisitor):
return False return False
def _update_name_ids(self, new_name_ids): def _update_name_ids(self, new_name_ids):
for name_id, ctxs in new_name_ids.items(): for name_id, ctxs in six.iteritems(new_name_ids):
self.name_ids[name_id] = ctxs + self.name_ids[name_id] self.name_ids[name_id] = ctxs + self.name_ids[name_id]
...@@ -250,7 +251,7 @@ def parse_cond_args(var_ids_dict, return_ids=None, ctx=gast.Load): ...@@ -250,7 +251,7 @@ def parse_cond_args(var_ids_dict, return_ids=None, ctx=gast.Load):
""" """
name_ids = [ name_ids = [
var_id for var_id, var_ctx in var_ids_dict.items() var_id for var_id, var_ctx in six.iteritems(var_ids_dict)
if isinstance(var_ctx[0], ctx) if isinstance(var_ctx[0], ctx)
] ]
if return_ids: if return_ids:
...@@ -341,7 +342,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, ...@@ -341,7 +342,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict,
def _vars_with_store(ids_dict): def _vars_with_store(ids_dict):
vars = [] vars = []
for k, ctxs in ids_dict.items(): for k, ctxs in six.iteritems(ids_dict):
if _is_return_var(ctxs): if _is_return_var(ctxs):
vars.append(k) vars.append(k)
return vars return vars
...@@ -353,7 +354,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, ...@@ -353,7 +354,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict,
def _vars_loaded_before_store(ids_dict): def _vars_loaded_before_store(ids_dict):
new_dict = defaultdict(list) new_dict = defaultdict(list)
for k, ctxs in ids_dict.items(): for k, ctxs in six.iteritems(ids_dict):
for ctx in ctxs: for ctx in ctxs:
if isinstance(ctx, gast.Load): if isinstance(ctx, gast.Load):
new_dict[k].append(ctx) new_dict[k].append(ctx)
......
...@@ -98,8 +98,15 @@ class TranslatorLogger(object): ...@@ -98,8 +98,15 @@ class TranslatorLogger(object):
return level == self.transformed_code_level return level == self.transformed_code_level
def has_verbosity(self, level): def has_verbosity(self, level):
"""
Checks whether the verbosity level set by the user is greater than or equal to the log level.
Args:
level(int): The level of log.
Returns:
True if the verbosity level set by the user is greater than or equal to the log level, otherwise False.
"""
level = self.check_level(level) level = self.check_level(level)
return level >= self.verbosity_level return self.verbosity_level >= level
def error(self, msg, *args, **kwargs): def error(self, msg, *args, **kwargs):
self.logger.error(msg, *args, **kwargs) self.logger.error(msg, *args, **kwargs)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import logging import logging
import six
from paddle.fluid import log_helper from paddle.fluid import log_helper
from paddle.fluid import framework, backward, core from paddle.fluid import framework, backward, core
...@@ -334,7 +335,7 @@ class PartialProgramLayer(layers.Layer): ...@@ -334,7 +335,7 @@ class PartialProgramLayer(layers.Layer):
param_and_buffer_names_set.add(var.name) param_and_buffer_names_set.add(var.name)
for block in main_program.blocks: for block in main_program.blocks:
for name, var in block.vars.items(): for name, var in six.iteritems(block.vars):
if isinstance(var, framework.Parameter): if isinstance(var, framework.Parameter):
if name not in param_and_buffer_names_set: if name not in param_and_buffer_names_set:
raise ValueError( raise ValueError(
......
...@@ -24,6 +24,7 @@ import warnings ...@@ -24,6 +24,7 @@ import warnings
import gast import gast
from paddle.fluid import framework from paddle.fluid import framework
from paddle.fluid import in_dygraph_mode
from paddle.fluid.dygraph import layers from paddle.fluid.dygraph import layers
from paddle.fluid.data_feeder import check_type from paddle.fluid.data_feeder import check_type
from paddle.fluid.layers.utils import flatten from paddle.fluid.layers.utils import flatten
...@@ -32,6 +33,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph ...@@ -32,6 +33,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
from paddle.fluid.dygraph.dygraph_to_static import logging_utils
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
...@@ -283,13 +285,21 @@ class StaticLayer(object): ...@@ -283,13 +285,21 @@ class StaticLayer(object):
Return: Return:
Outputs of decorated function. Outputs of decorated function.
""" """
# 1. call dygraph function directly if not enable `declarative` # 1. call dygraph function directly if not enable `declarative`
if not self._program_trans.enable_declarative: if not self._program_trans.enable_declarative:
warnings.warn( logging_utils.warn(
"The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. " "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. "
"We will just return dygraph output.") "We will just return dygraph output.")
return self._call_dygraph_function(*args, **kwargs) return self._call_dygraph_function(*args, **kwargs)
if not in_dygraph_mode() and self._program_trans.enable_declarative:
raise RuntimeError(
"Failed to run the callable object {} decorated by '@paddle.jit.to_static', "
"because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
"following API: paddle.disable_static().".format(
self.dygraph_function))
# 2. trace ops from dygraph layers and cache the generated program. # 2. trace ops from dygraph layers and cache the generated program.
args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
try: try:
...@@ -393,19 +403,43 @@ class StaticLayer(object): ...@@ -393,19 +403,43 @@ class StaticLayer(object):
def concrete_program(self): def concrete_program(self):
""" """
Returns recent ConcreteProgram instance of decorated function. Returns recent ConcreteProgram instance of decorated function.
Examples:
.. code-block:: python
import paddle
from paddle.jit import to_static
from paddle.static import InputSpec
paddle.disable_static()
def foo(x, y):
z = x + y
return z
# usage 1:
decorated_foo = to_static(foo, input_spec=[InputSpec([10], name='x'), InputSpec([10], name='y')])
print(decorated_foo.concrete_program)
# usage 2:
decorated_foo = to_static(foo)
out_foo = decorated_foo(paddle.rand([10]), paddle.rand([10]))
print(decorated_foo.concrete_program)
""" """
# If `input_spec` is specified, the length of program_cache will always be 1, # If `input_spec` is specified, the length of program_cache will always be 1,
# else, return the last one. # else, return the last one.
cached_program_len = len(self._program_cache) cached_program_len = len(self._program_cache)
# If `input_spec` is specified, apply conversion from dygraph layers into a static Program. # If `input_spec` is specified, apply conversion from dygraph layers into a static Program.
if cached_program_len == 0: if cached_program_len == 0:
if len(self._function_spec.flat_input_spec) > 0:
input_spec = self._function_spec.input_spec input_spec = self._function_spec.input_spec
has_input_spec = (input_spec is not None and len(input_spec) > 0)
if has_input_spec:
concrete_program, _ = self.get_concrete_program(*input_spec) concrete_program, _ = self.get_concrete_program(*input_spec)
return concrete_program return concrete_program
else: else:
raise ValueError("No valid transformed program for {}".format( raise ValueError(
self._function_spec)) "No valid transformed program for {}.\n\t Please specify `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n".
format(self._function_spec))
# If more than one programs have been cached, return the recent converted program by default. # If more than one programs have been cached, return the recent converted program by default.
elif cached_program_len > 1: elif cached_program_len > 1:
logging.warning( logging.warning(
...@@ -617,7 +651,7 @@ class ProgramCache(object): ...@@ -617,7 +651,7 @@ class ProgramCache(object):
return len(self._caches) return len(self._caches)
def concrete_programs(self): def concrete_programs(self):
return [cp for key, (cp, _) in self._caches.iteritems()] return [cp for key, (cp, _) in six.iteritems(self._caches)]
def synchronized(func): def synchronized(func):
......
...@@ -136,9 +136,12 @@ def is_api_in_module(node, module_prefix): ...@@ -136,9 +136,12 @@ def is_api_in_module(node, module_prefix):
# import_str = "".join(import_statements) # import_str = "".join(import_statements)
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import to_variable
import paddle.fluid.dygraph as dygraph from paddle import to_tensor
return eval("_is_api_in_module_helper({}, '{}')".format(func_str, return eval("_is_api_in_module_helper({}, '{}')".format(func_str,
module_prefix)) module_prefix))
except NameError: except NameError:
...@@ -146,15 +149,18 @@ def is_api_in_module(node, module_prefix): ...@@ -146,15 +149,18 @@ def is_api_in_module(node, module_prefix):
def is_dygraph_api(node): def is_dygraph_api(node):
# Note: A api in module dygraph_to_static is not a real dygraph api. # Note: A api in module dygraph_to_static is not a real dygraph api.
if is_api_in_module(node, "paddle.fluid.dygraph.dygraph_to_static"): if is_api_in_module(node, "paddle.fluid.dygraph.dygraph_to_static"):
return False return False
# TODO(liym27): A better way to determine whether it is a dygraph api.
# Consider the decorator @dygraph_only
return is_api_in_module(node, "paddle.fluid.dygraph") return is_api_in_module(node, "paddle.fluid.dygraph")
def is_paddle_api(node): def is_paddle_api(node):
return is_api_in_module(node, "paddle.fluid") return is_api_in_module(node, "paddle")
# Is numpy_api cannot reuse is_api_in_module because of numpy module problem # Is numpy_api cannot reuse is_api_in_module because of numpy module problem
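Widening the prefix in `is_paddle_api` from `paddle.fluid` to `paddle` means any API resolved under the top-level `paddle` package now qualifies. The real check works on the AST, but the underlying idea is ordinary prefix matching on a module path; a rough, hypothetical illustration using only the standard library:

import json

def _module_has_prefix(obj, prefix):
    # True if obj was defined in `prefix` or one of its submodules.
    module = getattr(obj, "__module__", "") or ""
    return module == prefix or module.startswith(prefix + ".")

print(_module_has_prefix(json.dumps, "json"))                 # True ("json")
print(_module_has_prefix(json.decoder.JSONDecoder, "json"))   # True ("json.decoder")
print(_module_has_prefix(json.dumps, "xml"))                  # False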
...@@ -233,14 +239,6 @@ def _add_keywords_to(node, dygraph_api_name): ...@@ -233,14 +239,6 @@ def _add_keywords_to(node, dygraph_api_name):
return return
def is_to_variable(node):
assert isinstance(node, gast.Call)
if is_dygraph_api(node):
api_name = ast_to_source_code(node.func).strip()
return api_name.endswith("to_variable")
return False
def to_static_ast(node, class_node): def to_static_ast(node, class_node):
assert isinstance(node, gast.Call) assert isinstance(node, gast.Call)
assert isinstance(class_node, gast.Call) assert isinstance(class_node, gast.Call)
...@@ -268,29 +266,6 @@ def to_static_ast(node, class_node): ...@@ -268,29 +266,6 @@ def to_static_ast(node, class_node):
return node return node
def to_assign_node(node):
# Transform dygraph api `fluid.dygraph.to_variable` to static api `fluid.layers.assign`.
# NOTE:
# 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16},
# but api `assign` only supports {float32, float64, int32, int64, bool};
# 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024.
assert isinstance(node, gast.Call)
assign_api = gast.parse('fluid.layers.assign').body[0].value
node.func = assign_api
if node.args:
node.args = [node.args[0]]
node.keywords = []
else:
for idx, kw in enumerate(node.keywords):
if kw.arg == 'value':
node.keywords[idx].arg = 'input'
node.keywords = [node.keywords[idx]]
node.args = []
break
return node
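With `is_to_variable` and `to_assign_node` removed, the transformer no longer rewrites `fluid.dygraph.to_variable` into `fluid.layers.assign`; 2.0-style code reaches instead for the `paddle.to_tensor` entry point added to the import list above. A minimal usage sketch, assuming a Paddle 2.x install:

import numpy as np
import paddle

# paddle.to_tensor accepts Python scalars, lists, and numpy arrays.
x = paddle.to_tensor(np.random.rand(10, 1).astype("float32"))
y = paddle.to_tensor([1.0, 2.0, 3.0])
print(x.shape, y.dtype)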
def update_args_of_func(node, dygraph_node, method_name): def update_args_of_func(node, dygraph_node, method_name):
assert isinstance(node, gast.Call) assert isinstance(node, gast.Call)
if method_name not in ["__init__", "forward"]: if method_name not in ["__init__", "forward"]:
...@@ -493,7 +468,7 @@ def recover_globals_attribute(src_obj, dst_obj): ...@@ -493,7 +468,7 @@ def recover_globals_attribute(src_obj, dst_obj):
src_globals = getattr(src_obj, attr_name, {}) src_globals = getattr(src_obj, attr_name, {})
dst_globals = getattr(dst_obj, attr_name, {}) dst_globals = getattr(dst_obj, attr_name, {})
for k, v in src_globals.items(): for k, v in six.iteritems(src_globals):
# ignore builtin attribute. # ignore builtin attribute.
if not (k.startswith('__') and k.endswith('__')): if not (k.startswith('__') and k.endswith('__')):
dst_globals[k] = v dst_globals[k] = v
......
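The loop in `recover_globals_attribute` above, like `concrete_programs` earlier and `save` below, switches to `six.iteritems`, which dispatches to `dict.iteritems()` on Python 2 and `dict.items()` on Python 3 so the iteration stays lazy on both interpreter lines. A small compatibility sketch:

import six

state = {"linear.weight": "w_0", "linear.bias": "b_0"}

# six.iteritems returns an iterator on both Python 2 and Python 3,
# avoiding the intermediate list that Python 2's dict.items() builds.
for key, value in six.iteritems(state):
    print(key, value)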
...@@ -754,7 +754,7 @@ def save(layer, model_path, input_spec=None, configs=None): ...@@ -754,7 +754,7 @@ def save(layer, model_path, input_spec=None, configs=None):
# saved to inference program may not need by dygraph Layer, # saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name # we only record the state_dict variable's structured name
state_names_dict = dict() state_names_dict = dict()
for structured_name, var in layer.state_dict().items(): for structured_name, var in six.iteritems(layer.state_dict()):
state_names_dict[var.name] = structured_name state_names_dict[var.name] = structured_name
# 3. share parameters from Layer to scope & record var info # 3. share parameters from Layer to scope & record var info
......
...@@ -41,7 +41,7 @@ def monkey_patch_math_varbase(): ...@@ -41,7 +41,7 @@ def monkey_patch_math_varbase():
The difference is, in dygraph mode, use auto-generated op functions for better performance. The difference is, in dygraph mode, use auto-generated op functions for better performance.
""" """
@no_grad() @no_grad
def create_tensor(value, dtype, shape): def create_tensor(value, dtype, shape):
out = _varbase_creator(dtype=dtype) out = _varbase_creator(dtype=dtype)
out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape, out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape,
......
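The decorator change from `@no_grad()` to `@no_grad` relies on `no_grad` being usable without a trailing call. As a generic Python pattern (illustrative only, not Paddle's actual implementation), a decorator can accept both spellings by checking whether it already received the function:

import functools

def no_grad(func=None):
    """Toy decorator usable as @no_grad or @no_grad() (illustrative only)."""
    def decorate(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            # a real implementation would disable gradient tracking here
            return f(*args, **kwargs)
        return wrapper
    if func is None:            # used as @no_grad()
        return decorate
    return decorate(func)       # used as @no_grad

@no_grad
def create_tensor():
    return "tensor"

@no_grad()
def create_other_tensor():
    return "tensor"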
...@@ -349,38 +349,53 @@ class DataParallel(layers.Layer): ...@@ -349,38 +349,53 @@ class DataParallel(layers.Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import paddle
import paddle.fluid as fluid import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist
place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) class LinearNet(nn.Layer):
with fluid.dygraph.guard(place): def __init__(self):
super(LinearNet, self).__init__()
self._linear1 = nn.Linear(10, 10)
self._linear2 = nn.Linear(10, 1)
# prepare the data parallel context def forward(self, x):
strategy = fluid.dygraph.prepare_context() return self._linear2(self._linear1(x))
linear = fluid.dygraph.Linear(1, 10, act="softmax") def train():
adam = fluid.optimizer.AdamOptimizer( # 1. enable dynamic mode
learning_rate=0.001, parameter_list=linear.parameters()) paddle.disable_static()
# make the module become the data parallelism module # 2. initialize parallel environment
linear = fluid.dygraph.DataParallel(linear, strategy) dist.init_parallel_env()
x_data = np.random.random(size=[10, 1]).astype(np.float32) # 3. create data parallel layer & optimizer
data = fluid.dygraph.to_variable(x_data) layer = LinearNet()
dp_layer = paddle.DataParallel(layer)
hidden = linear(data) loss_fn = nn.MSELoss()
avg_loss = fluid.layers.mean(hidden) adam = opt.Adam(
learning_rate=0.001, parameters=dp_layer.parameters())
# scale the loss according to the number of trainers. # 4. run layer
avg_loss = linear.scale_loss(avg_loss) inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
loss = loss_fn(outputs, labels)
avg_loss.backward() loss = dp_layer.scale_loss(loss)
loss.backward()
dp_layer.apply_collective_grads()
# collect the gradients of trainers. adam.step()
linear.apply_collective_grads() adam.clear_grad()
adam.minimize(avg_loss) if __name__ == '__main__':
linear.clear_gradients() # 1. start by ``paddle.distributed.spawn`` (default)
dist.spawn(train, nprocs=2)
# 2. start by ``paddle.distributed.launch``
# train()
""" """
if not self._is_data_parallel_mode(): if not self._is_data_parallel_mode():
return loss return loss
...@@ -430,7 +445,7 @@ class DataParallel(layers.Layer): ...@@ -430,7 +445,7 @@ class DataParallel(layers.Layer):
self._reshape_inplace(x=g_var, shape=g_shape) self._reshape_inplace(x=g_var, shape=g_shape)
assert g_var.shape == g_shape assert g_var.shape == g_shape
@no_grad() @no_grad
def apply_collective_grads(self): def apply_collective_grads(self):
""" """
AllReduce the Parameters' gradient. AllReduce the Parameters' gradient.
...@@ -438,38 +453,53 @@ class DataParallel(layers.Layer): ...@@ -438,38 +453,53 @@ class DataParallel(layers.Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import paddle
import paddle.fluid as fluid import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist
place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) class LinearNet(nn.Layer):
with fluid.dygraph.guard(place): def __init__(self):
super(LinearNet, self).__init__()
self._linear1 = nn.Linear(10, 10)
self._linear2 = nn.Linear(10, 1)
# prepare the data parallel context def forward(self, x):
strategy = fluid.dygraph.prepare_context() return self._linear2(self._linear1(x))
linear = fluid.dygraph.Linear(1, 10, act="softmax") def train():
adam = fluid.optimizer.AdamOptimizer( # 1. enable dynamic mode
learning_rate=0.001, parameter_list=linear.parameters()) paddle.disable_static()
# make the module become the data parallelism module # 2. initialize parallel environment
linear = fluid.dygraph.DataParallel(linear, strategy) dist.init_parallel_env()
x_data = np.random.random(size=[10, 1]).astype(np.float32) # 3. create data parallel layer & optimizer
data = fluid.dygraph.to_variable(x_data) layer = LinearNet()
dp_layer = paddle.DataParallel(layer)
hidden = linear(data) loss_fn = nn.MSELoss()
avg_loss = fluid.layers.mean(hidden) adam = opt.Adam(
learning_rate=0.001, parameters=dp_layer.parameters())
# scale the loss according to the number of trainers. # 4. run layer
avg_loss = linear.scale_loss(avg_loss) inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
loss = loss_fn(outputs, labels)
avg_loss.backward() loss = dp_layer.scale_loss(loss)
loss.backward()
dp_layer.apply_collective_grads()
# collect the gradients of trainers. adam.step()
linear.apply_collective_grads() adam.clear_grad()
adam.minimize(avg_loss) if __name__ == '__main__':
linear.clear_gradients() # 1. start by ``paddle.distributed.spawn`` (default)
dist.spawn(train, nprocs=2)
# 2. start by ``paddle.distributed.launch``
# train()
""" """
if not self._is_data_parallel_mode(): if not self._is_data_parallel_mode():
return return
......
...@@ -145,7 +145,7 @@ class Fleet(object): ...@@ -145,7 +145,7 @@ class Fleet(object):
Returns: Returns:
bool: True if this is a node of server, bool: True if this is a node of server,
False if not. False if not
""" """
return self._role_maker.is_server() return self._role_maker.is_server()
......
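The `is_server` docstring above only loses its trailing period; the call itself is unchanged. A hedged usage sketch, assuming a parameter-server style job launched through the `paddle.distributed.fleet` tooling so the role can be inferred from the environment:

import paddle.distributed.fleet as fleet

fleet.init()                    # role is read from the launch environment
if fleet.is_server():
    print("this process hosts a parameter server")
elif fleet.is_worker():
    print("this process runs as a trainer worker")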
(The remaining file diffs in this commit are collapsed and not shown.)