Unverified · Commit 3232618a authored by gongweibao, committed by GitHub

Cherry-pick: Make fuse_all_reduce_op_pass support mix_precision test=develop test=release (#18490)

Parent 24107006
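At its core, this cherry-pick makes the gradient-fusion passes dtype-aware: parameter/gradient pairs are grouped so that every fused buffer, and every fused all_reduce, contains gradients of a single data type, which is what lets float16 and float32 gradients coexist. Below is a minimal, self-contained sketch of that grouping idea only; the names DType, ParamsAndGrads and GroupByDtype are simplified stand-ins for illustration, not the framework classes that appear in the diff that follows.

// Illustrative sketch: group (param, grad) pairs by gradient dtype so that
// each resulting group is dtype-uniform and can be fused into one buffer.
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

enum class DType { kFP16, kFP32 };  // stand-in for proto::VarType::Type
using ParamsAndGrads = std::vector<std::pair<std::string, std::string>>;
using GroupParamsAndGrads = std::vector<ParamsAndGrads>;

GroupParamsAndGrads GroupByDtype(
    const ParamsAndGrads &params_grads,
    const std::map<std::string, DType> &grad_dtype) {
  GroupParamsAndGrads groups;
  std::map<DType, size_t> group_idx;  // dtype -> index into groups
  for (const auto &p_g : params_grads) {
    DType dtype = grad_dtype.at(p_g.second);
    auto it = group_idx.find(dtype);
    if (it == group_idx.end()) {
      it = group_idx.emplace(dtype, groups.size()).first;
      groups.emplace_back();
    }
    groups[it->second].push_back(p_g);
  }
  return groups;
}

int main() {
  ParamsAndGrads pg = {{"conv1.w", "conv1.w@GRAD"},
                       {"fc.w", "fc.w@GRAD"},
                       {"conv2.w", "conv2.w@GRAD"}};
  std::map<std::string, DType> dtypes = {{"conv1.w@GRAD", DType::kFP16},
                                         {"fc.w@GRAD", DType::kFP32},
                                         {"conv2.w@GRAD", DType::kFP16}};
  auto groups = GroupByDtype(pg, dtypes);
  std::cout << "number of dtype-uniform groups: " << groups.size() << "\n";
  return 0;
}

In the actual pass the grouping key also includes the parameter-name prefix, and groups are later merged up to a memory budget, again without crossing dtype boundaries, as the diff below shows.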
......@@ -58,15 +58,15 @@ constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef std::string FusedOptType;
constexpr char kFusedOptType[] = "fused_opt_type";
typedef std::string FusedGrads;
typedef std::vector<std::string> FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params";
GroupParamsAndGrads;
constexpr char kGroupParamsAndGrads[] = "group_params_grads";
} // namespace details
} // namespace framework
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h"
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -52,18 +53,13 @@ static constexpr double kMB = 1048576.0;
void SetFuseParameterGroupsSize(int group_size) {
FLAGS_fuse_parameter_groups_size = group_size;
}
int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
void SetFuseParameterMemorySize(double memory_size) {
FLAGS_fuse_parameter_memory_size = memory_size;
}
double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; }
static framework::proto::VarType::Type kDefaultDtype =
framework::proto::VarType::Type::VarType_Type_BOOL;
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const {
......@@ -73,19 +69,16 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
ResetAttribute<details::ParamsAndGrads>(details::kParamsAndGrads, &result);
ResetAttribute<details::GroupGradsAndParams>(details::kGroupGradsAndParams,
ResetAttribute<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads,
&result);
// NOTE: The operator nodes should be in topology order.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
auto &params_grads =
result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
for (auto &node : topo_nodes) {
RecordParamsAndGrads(node, &params_grads);
}
RecordParamsAndGrads(result, &params_grads);
if (params_grads.size() == 0) {
LOG(INFO) << "Doesn't find gradients";
auto num_params_grads = params_grads.size();
VLOG(10) << "The number of params and grads is:" << num_params_grads;
if (num_params_grads == 0) {
return;
}
......@@ -101,24 +94,43 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
auto &group_grads_params =
result.Get<details::GroupGradsAndParams>(details::kGroupGradsAndParams);
// Note: the order of params_grads may be changed by SetGroupGradsAndParams.
SetGroupGradsAndParams(var_name2node, params_grads, &group_grads_params);
auto &group_params_grads =
result.Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
// Note: the order of params_grads may be changed by SetGroupParamsAndGrads.
SetGroupParamsAndGrads(var_name2node, params_grads, &group_params_grads);
params_grads.clear();
for (auto &group_p_g : group_grads_params) {
params_grads.insert(params_grads.begin(), group_p_g.begin(),
params_grads.reserve(num_params_grads);
for (auto &group_p_g : group_params_grads) {
params_grads.insert(params_grads.end(), group_p_g.begin(),
group_p_g.end());
}
for (auto &p_g : params_grads) {
std::swap(p_g.first, p_g.second);
}
PADDLE_ENFORCE_EQ(
num_params_grads, params_grads.size(),
"The number of params_grads is not consistent with before.");
if (IsUnifiedDtype(params_grads, var_name2node)) {
SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
AllocContinuousAddressSpace(places, local_scopes, var_name2node,
params_grads, &result);
} else {
// Set Gradients as Persistable to prevent this var becoming reusable.
auto dtype = kDefaultDtype;
for (auto &p_g : params_grads) {
for (auto &sub_param_grad : group_params_grads) {
SetGradientPersistable(params_grads, var_name2node, var_name2node_set);
PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, var_name2node),
"The data type of the same group is not consistent.");
AllocContinuousAddressSpace(places, local_scopes, var_name2node,
sub_param_grad, &result);
}
}
}
void SetGradientPersistable(
const std::vector<std::pair<std::string, std::string>> &sub_param_grad,
const std::unordered_map<std::string, Node *> &var_name2node,
const std::unordered_map<std::string, std::unordered_set<ir::Node *>>
&var_name2node_set) const {
for (auto &p_g : sub_param_grad) {
// Get gradient var
auto iter = var_name2node.find(p_g.second);
PADDLE_ENFORCE(iter != var_name2node.end(), "%s is not found.",
......@@ -132,32 +144,45 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
}
}
// Get Dtype
auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
"The data type should not be bool.");
bool IsUnifiedDtype(
const details::ParamsAndGrads &params_grads,
const std::unordered_map<std::string, Node *> &var_name2node) const {
auto dtype =
this->GetDtypeOfVar(var_name2node, params_grads.front().second);
for (auto p_g : params_grads) {
auto next_dtype = this->GetDtypeOfVar(var_name2node, p_g.second);
if (next_dtype != dtype) {
return false;
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype,
"The data type of input is not consistent.");
}
return true;
}
void AllocContinuousAddressSpace(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, Node *> &var_name2node,
const details::ParamsAndGrads &params_grads, Graph *result) const {
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// pass.
if (!result.Has(details::kFusedVars)) {
result.Set(details::kFusedVars, new details::FusedVars);
if (!result->Has(details::kFusedVars)) {
result->Set(details::kFusedVars, new details::FusedVars);
}
// the kFusedGrads is used by fuse_optimizer_op_pass.
result.Set(details::kFusedGrads, new details::FusedGrads);
if (!result->Has(details::kFusedGrads)) {
result->Set(details::kFusedGrads, new details::FusedGrads);
}
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto fused_var_name = std::string(details::kFusedVarNamePrefix) + "@GRAD@" +
params_grads.begin()->second;
result.Get<details::FusedGrads>(details::kFusedGrads) = fused_var_name;
auto &fused_var_set = result.Get<details::FusedVars>(details::kFusedVars);
result->Get<details::FusedGrads>(details::kFusedGrads)
.emplace_back(fused_var_name);
auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name);
fused_var_set.insert(fused_var_name);
......@@ -175,109 +200,126 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
graph->Set(attr_name, new AttrType);
}
void SetGroupGradsAndParams(
void SetGroupParamsAndGrads(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const details::ParamsAndGrads &params_grads,
details::GroupGradsAndParams *group_grads_params) const {
SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
details::GroupParamsAndGrads *group_params_grads) const {
SetGroupAccordingToLayers(var_nodes, params_grads, group_params_grads);
SetGroupAccordingToMemorySize(var_nodes, group_params_grads);
}
void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const details::ParamsAndGrads &params_grads,
details::GroupGradsAndParams *group_grads_params) const {
std::unordered_map<std::string, std::vector<int>> layer_params;
details::GroupParamsAndGrads *group_params_grads) const {
using var_dtype = std::pair<std::string, proto::VarType::Type>;
std::map<var_dtype, size_t> var_idx;
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
auto dtype = GetDtypeOfVar(var_nodes, params_grads[i].second);
var_dtype var_key;
if (pos == std::string::npos) {
layer_params[params_grads[i].first].emplace_back(i);
var_key = std::make_pair(params_grads[i].first, dtype);
} else {
layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
}
var_key = std::make_pair(params_grads[i].first.substr(0, pos), dtype);
}
group_grads_params->reserve(layer_params.size());
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
std::string key = params_grads[i].first;
if (pos != std::string::npos) {
key = params_grads[i].first.substr(0, pos);
}
auto iter = layer_params.find(key);
if (iter == layer_params.end()) continue;
group_grads_params->emplace_back();
auto &local_group_grads_params = group_grads_params->back();
for (auto &idx : iter->second) {
local_group_grads_params.emplace_back(
std::make_pair(params_grads[idx].second, params_grads[idx].first));
size_t idx = 0;
auto var_idx_iter = var_idx.find(var_key);
if (var_idx_iter != var_idx.end()) {
idx = var_idx_iter->second;
} else {
group_params_grads->emplace_back();
idx = group_params_grads->size() - 1;
var_idx[var_key] = idx;
}
layer_params.erase(iter);
auto &local_group_params_grads = group_params_grads->at(idx);
local_group_params_grads.emplace_back(
std::make_pair(params_grads[i].first, params_grads[i].second));
}
VLOG(10) << "SetGroupAccordingToLayers: ";
if (VLOG_IS_ON(10)) {
PrintGroupInfo(var_nodes, group_grads_params);
VLOG(10) << "SetGroupAccordingToLayers: ";
PrintGroupInfo(var_nodes, group_params_grads);
}
}
void PrintGroupInfo(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
details::GroupGradsAndParams *group_grads_params) const {
for (size_t i = 0; i < group_grads_params->size(); ++i) {
details::GroupParamsAndGrads *group_params_grads) const {
for (size_t i = 0; i < group_params_grads->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
size_t gps_size = 0;
for (auto &g_p : group_grads_params->at(i)) {
auto iter = var_nodes.find(g_p.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
for (auto &p_g : group_params_grads->at(i)) {
auto iter = var_nodes.find(p_g.first);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", p_g.first);
auto shape = iter->second->Var()->GetShape();
size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
gps_size += size;
out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second);
}
auto dtype = this->GetDtypeOfVar(var_nodes,
group_params_grads->at(i).front().first);
VLOG(10) << out.str()
<< ", group size:" << group_grads_params->at(i).size()
<< ", group size:" << group_params_grads->at(i).size()
<< ", group memory size:" << static_cast<double>(gps_size) / kMB
<< "(MB)";
<< "(MB)"
<< ", dtype:" << dtype;
}
}
void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
details::GroupGradsAndParams *group_grads_params) const {
details::GroupParamsAndGrads *group_params_grads) const {
const double group_memory_size = GetFuseParameterMemorySize();
if (group_memory_size <= 0.0) {
return;
}
details::GroupGradsAndParams local_group_grads_params;
details::GroupParamsAndGrads local_group_params_grads;
size_t j = 0;
while (j < group_grads_params->size()) {
local_group_grads_params.emplace_back();
auto &group_p_g = local_group_grads_params.back();
while (j < group_params_grads->size()) {
local_group_params_grads.emplace_back();
auto &group_p_g = local_group_params_grads.back();
auto &grad_name = group_params_grads->at(j).front().second;
auto var_type = GetDtypeOfVar(var_nodes, grad_name);
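// Subsequent groups are merged into this one only while they share var_type;
// the dtype check at the bottom of the inner loop ends the merge at a dtype boundary.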
size_t local_group_memory_size = 0;
while (j < group_grads_params->size()) {
while (j < group_params_grads->size()) {
std::for_each(
group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
group_params_grads->at(j).begin(), group_params_grads->at(j).end(),
[&local_group_memory_size,
&var_nodes](const std::pair<std::string, std::string> &g_p) {
auto iter = var_nodes.find(g_p.second);
&var_nodes](const std::pair<std::string, std::string> &p_g) {
auto iter = var_nodes.find(p_g.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
g_p.second);
auto shape = iter->second->Var()->GetShape();
p_g.second);
size_t size =
framework::SizeOfType(iter->second->Var()->GetDataType());
auto shape = iter->second->Var()->GetShape();
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
local_group_memory_size += size;
});
group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
group_grads_params->at(j).end());
group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(),
group_params_grads->at(j).end());
++j;
if (j >= group_params_grads->size()) {
break;
}
if (GetFuseParameterGroupsSize() > 1 &&
group_p_g.size() >
static_cast<size_t>(GetFuseParameterGroupsSize())) {
......@@ -288,17 +330,31 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
group_memory_size) {
break;
}
auto next_var_type =
GetDtypeOfVar(var_nodes, group_params_grads->at(j).front().second);
if (next_var_type != var_type) {
break;
}
}
}
std::swap(*group_grads_params, local_group_grads_params);
std::swap(*group_params_grads, local_group_params_grads);
if (VLOG_IS_ON(10)) {
VLOG(10) << string::Sprintf(
"SetGroupAccordingToMemorySize(memory_size: %f):", group_memory_size);
if (VLOG_IS_ON(10)) {
PrintGroupInfo(var_nodes, group_grads_params);
PrintGroupInfo(var_nodes, group_params_grads);
}
}
proto::VarType::Type GetDtypeOfVar(
const std::unordered_map<std::string, Node *> &var_nodes,
const std::string &name) const {
auto grad_iter = var_nodes.find(name);
PADDLE_ENFORCE(grad_iter != var_nodes.end());
PADDLE_ENFORCE_NOT_NULL(grad_iter->second->Var());
return grad_iter->second->Var()->GetDataType();
}
private:
......@@ -307,22 +363,22 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
return type == proto::VarType::LOD_TENSOR;
}
void RecordParamsAndGrads(ir::Node *node,
void RecordParamsAndGrads(const ir::Graph &graph,
details::ParamsAndGrads *params_grads) const {
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(graph);
for (auto &node : topo_nodes) {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) return;
if (!is_bk_op) continue;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
auto backward_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
for (size_t i = 0; i < backward_vars.size(); i += 2) {
VLOG(10) << "Trainable parameter: " << backward_vars[i]
<< ", gradient: " << backward_vars[i + 1];
......@@ -333,6 +389,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
} catch (boost::bad_get e) {
}
}
}
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
......
......@@ -101,10 +101,17 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
"this pass.");
}
auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
PADDLE_ENFORCE_NE(fused_grad.size(), 0,
"The fused gradient should not be empty.");
PADDLE_ENFORCE_EQ(fused_grad.size(), 1,
"Because the dtype of those gradients "
"is not unified, so the number of fused gradients is "
"more than one, but it is not supported currently.");
auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
auto iter =
std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
fused_vars_name[kGrad] = fused_grad;
fused_vars_name[kGrad] = fused_grad.front();
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
......
......@@ -30,7 +30,6 @@ class FuseAllReduceOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
......@@ -38,38 +37,17 @@ class FuseAllReduceOpPass : public ir::Pass {
&Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
#endif
std::unordered_set<std::string> grads;
auto &params_grads =
result.Get<details::ParamsAndGrads>(details::kParamsAndGrads);
size_t num_of_all_reduce = params_grads.size();
std::unordered_set<std::string> grads;
grads.reserve(num_of_all_reduce);
for (auto p_g : params_grads) {
grads.insert(p_g.second);
}
size_t num_place = places.size();
std::unordered_map<std::string, ir::Node *> all_reduce_ops;
all_reduce_ops.reserve(grads.size());
for (auto &node : result.Nodes()) {
if (node->IsOp()) {
PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
&node->Wrapper<details::OpHandleBase>());
if (all_reduce_op_handle) {
auto inputs = details::DynamicCast<details::VarHandle>(
all_reduce_op_handle->Inputs());
PADDLE_ENFORCE_EQ(inputs.size(), num_place);
// The inputs' name should be the same.
auto &grad_name = inputs[0]->name();
for (size_t i = 1; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
"The input name should be the same.");
}
PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
all_reduce_ops.emplace(grad_name, node);
}
}
}
std::unordered_map<std::string, Node *> all_reduce_ops =
GetAllReduceOps(result, places, grads);
VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
if (all_reduce_ops.size() == 0) {
......@@ -82,16 +60,16 @@ class FuseAllReduceOpPass : public ir::Pass {
"it is not supported currently.");
VLOG(10) << "Insert fused_all_reduce";
auto &group_grads_params =
graph->Get<details::GroupGradsAndParams>(details::kGroupGradsAndParams);
auto &group_params_grads =
graph->Get<details::GroupParamsAndGrads>(details::kGroupParamsAndGrads);
for (auto &group_g_p : group_grads_params) {
size_t group_size = group_g_p.size();
for (auto &group_p_g : group_params_grads) {
size_t group_size = group_p_g.size();
PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
std::vector<ir::Node *> group_all_reduce_ops;
group_all_reduce_ops.reserve(group_size);
for (auto &g_p : group_g_p) {
group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
for (auto &p_g : group_p_g) {
group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second));
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce(places, local_scopes, group_size,
......@@ -103,6 +81,35 @@ class FuseAllReduceOpPass : public ir::Pass {
}
}
std::unordered_map<std::string, Node *> GetAllReduceOps(
const Graph &result, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &grads) const {
size_t num_place = places.size();
std::unordered_map<std::string, Node *> all_reduce_ops;
all_reduce_ops.reserve(grads.size());
for (auto &node : result.Nodes()) {
if (node->IsOp()) {
PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
&node->Wrapper<details::OpHandleBase>());
if (all_reduce_op_handle) {
auto inputs = details::DynamicCast<details::VarHandle>(
all_reduce_op_handle->Inputs());
PADDLE_ENFORCE_EQ(inputs.size(), num_place);
// The inputs' name should be the same.
auto &grad_name = inputs[0]->name();
for (size_t i = 1; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
"The input name should be the same.");
}
PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
all_reduce_ops.emplace(grad_name, node);
}
}
}
return all_reduce_ops;
}
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const size_t num_of_all_reduce,
......
......@@ -227,8 +227,11 @@ REGISTER_OPERATOR(alloc_continuous_space,
paddle::operators::AllocContinuousSpaceOp,
paddle::operators::AllocContinuousSpaceOpMaker);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(
alloc_continuous_space,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, int>,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext, float>,
ops::AllocContinuousSpaceKernel<paddle::platform::CPUDeviceContext,
......@@ -237,6 +240,8 @@ REGISTER_OP_CPU_KERNEL(
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL(
alloc_continuous_space,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, int>,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext, float>,
ops::AllocContinuousSpaceKernel<paddle::platform::CUDADeviceContext,
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include <string>
namespace paddle {
namespace operators {
......@@ -46,6 +46,17 @@ class SGDOp : public framework::OperatorWithKernel {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(data_type, ctx.device_context());
}
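// For mixed precision, LearningRate keeps its own dtype (typically float32)
// instead of being transformed to the kernel's expected dtype (e.g. float16).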
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (var_name == "LearningRate") {
return framework::OpKernelType(tensor.type(), tensor.place(),
tensor.layout());
}
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class SGDOpInferVarType : public framework::VarTypeInference {
......
......@@ -46,7 +46,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
// Atomic Operation to avoid concurrent write error.
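// The constant is cast to T so the arithmetic stays in the element type
// (needed once the kernel is also registered for float16).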
paddle::platform::CudaAtomicAdd(
tensor_out_ptr + index,
-1.0 * learning_rate[0] * selected_rows_ptr[index]);
-static_cast<T>(1.0) * learning_rate[0] * selected_rows_ptr[index]);
}
}
}
......@@ -122,5 +122,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel<float>,
ops::SGDOpCUDAKernel<double>);
ops::SGDOpCUDAKernel<double>,
ops::SGDOpCUDAKernel<plat::float16>);
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid.core as core
import math
import os
import sys
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from simple_nets import init_data
from parallel_executor_test_base import TestParallelExecutorBase
batch_size = 12
img_shape = [1, 28, 28]
def loss_net(hidden, label):
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
def conv_net(use_feed):
img = fluid.layers.data(name='image', shape=img_shape, dtype='float16')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
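# Cast back to float32 here so the network holds both float16 and float32
# parameters/gradients, which is what the mixed-precision fuse passes are tested against.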
conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
hidden = fluid.layers.cast(conv_pool_2, np.float32)
return loss_net(hidden, label)
def _optimizer(learning_rate=1e-6):
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
return optimizer
class TestResnet(TestParallelExecutorBase):
def check_model(self, use_cuda):
img, label = init_data(
batch_size=batch_size, img_shape=img_shape, label_range=9)
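# View the float16 bits as uint16 for feeding; presumably the feed path in this
# version does not accept numpy float16 arrays directly.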
img = np.float16(img).view(np.uint16)
feed_dict = {"image": img, "label": label}
TestParallelExecutorBase.check_network_convergence(
conv_net,
feed_dict=feed_dict,
iter=10,
use_cuda=use_cuda,
fuse_all_reduce_ops=True,
optimizer=_optimizer)
def test_model(self):
if core.is_compiled_with_cuda():
self.check_model(True)
if __name__ == '__main__':
unittest.main()