Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
a9f5f822
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a9f5f822
编写于
10月 17, 2018
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
use binary search. test=develop
上级
38612695
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
335 addition
and
171 deletion
+335
-171
paddle/fluid/operators/momentum_op.cc
paddle/fluid/operators/momentum_op.cc
+8
-3
paddle/fluid/operators/momentum_op.cu
paddle/fluid/operators/momentum_op.cu
+3
-121
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+324
-47
未找到文件。
paddle/fluid/operators/momentum_op.cc
浏览文件 @
a9f5f822
...
...
@@ -74,9 +74,13 @@ class MomentumOpInferVarType : public framework::VarTypeInference {
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
}
else
if
(
block
->
FindRecursiveOrCreateVar
(
input_var
).
GetType
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
else
{
PADDLE_THROW
(
"Only support LodTensor and SelectedRows, Unexpected Input Type."
);
}
}
}
...
...
@@ -135,5 +139,6 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
momentum
,
ops
::
MomentumOp
,
ops
::
MomentumOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
,
ops
::
MomentumOpInferVarType
);
REGISTER_OP_CPU_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
float
>
,
ops
::
MomentumOpKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/momentum_op.cu
浏览文件 @
a9f5f822
...
...
@@ -15,125 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
__global__
void
MomentumKernel
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
bool
use_nesterov
,
T
*
p_out
,
T
*
v_out
)
{
T
lr
=
learning_rate
[
0
];
if
(
use_nesterov
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
T
g_val
=
g
[
i
];
T
v_new
=
v
[
i
]
*
mu
+
g_val
;
v_out
[
i
]
=
v_new
;
p_out
[
i
]
=
p
[
i
]
-
(
g_val
+
v_new
*
mu
)
*
lr
;
}
}
else
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
T
v_new
=
v
[
i
]
*
mu
+
g
[
i
];
v_out
[
i
]
=
v_new
;
p_out
[
i
]
=
p
[
i
]
-
lr
*
v_new
;
}
}
}
template
<
typename
T
>
__global__
void
SparseMomentumKernel
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
lr
,
const
T
mu
,
const
int64_t
*
grad_rows
,
const
size_t
grad_row_numel
,
const
size_t
grad_row_size
,
const
T
use_nesterov
,
T
*
p_out
,
T
*
v_out
)
{
for
(
int
i
=
blockIdx
.
x
;
i
<
grad_row_size
;
i
+=
gridDim
.
x
)
{
for
(
int
j
=
threadIdx
.
x
;
j
<
grad_row_numel
;
j
+=
blockDim
.
x
)
{
size_t
p_i
=
grad_rows
[
i
]
*
grad_row_numel
+
j
;
size_t
g_i
=
i
*
grad_row_numel
+
j
;
v_out
[
g_i
]
=
v
[
g_i
]
*
mu
+
g
[
g_i
];
if
(
use_nesterov
)
{
p_out
[
p_i
]
=
p
[
p_i
]
-
(
g
[
g_i
]
+
v_out
[
g_i
]
*
mu
)
*
lr
[
0
];
}
else
{
p_out
[
p_i
]
=
p
[
p_i
]
-
v_out
[
g_i
]
*
lr
[
0
];
}
}
}
}
template
<
typename
T
>
class
MomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
bool
use_nesterov
=
ctx
.
Attr
<
bool
>
(
"use_nesterov"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
velocity_var
=
ctx
.
InputVar
(
"Velocity"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
PADDLE_ENFORCE
(
velocity_var
->
IsType
<
framework
::
LoDTensor
>
(),
"Unmatched Type of Param and Grad"
);
auto
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
T
*
p_out
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
v_out
=
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
p
=
param
->
data
<
T
>
();
auto
*
v
=
velocity
->
data
<
T
>
();
auto
*
g
=
grad
->
data
<
T
>
();
auto
*
lr
=
learning_rate
->
data
<
T
>
();
const
int
kThreadPerBlock
=
256
;
int
grid
=
(
param
->
numel
()
+
kThreadPerBlock
-
1
)
/
kThreadPerBlock
;
MomentumKernel
<
T
><<<
grid
,
kThreadPerBlock
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
p
,
g
,
v
,
lr
,
mu
,
param
->
numel
(),
use_nesterov
,
p_out
,
v_out
);
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
// sparse update embedding with selectedrows
PADDLE_ENFORCE
(
velocity_var
->
IsType
<
framework
::
SelectedRows
>
(),
"Unmatched Type of Param and Grad"
);
auto
velocity
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
SelectedRows
>
(
"VelocityOut"
);
// sparse update maybe empty.
if
(
grad
->
rows
().
size
()
==
0
)
{
return
;
}
PADDLE_ENFORCE
(
grad
->
height
()
==
velocity
->
height
(),
"Unmatched gradient and velocity."
);
auto
*
p_out
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
v_out
=
velocity_out
->
mutable_value
()
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
lr
=
learning_rate
->
data
<
T
>
();
auto
*
p
=
param
->
data
<
T
>
();
auto
*
g
=
grad
->
value
().
data
<
T
>
();
auto
*
v
=
velocity
->
value
().
data
<
T
>
();
size_t
grad_row_numel
=
grad
->
value
().
numel
()
/
grad
->
rows
().
size
();
size_t
grad_row_size
=
grad
->
rows
().
size
();
framework
::
Vector
<
int64_t
>
rows
(
grad
->
rows
());
const
int
kThreadPerBlock
=
256
;
int
grid
=
(
param
->
numel
()
+
kThreadPerBlock
-
1
)
/
kThreadPerBlock
;
SparseMomentumKernel
<
T
><<<
grid
,
kThreadPerBlock
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
p
,
g
,
v
,
lr
,
mu
,
rows
.
CUDAData
(
ctx
.
GetPlace
()),
grad_row_numel
,
grad
->
rows
().
size
(),
use_nesterov
,
p_out
,
v_out
);
}
else
{
PADDLE_THROW
(
"Unsupported Variable Type of Grad"
);
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
momentum
,
ops
::
MomentumOpCUDAKernel
<
float
>
,
ops
::
MomentumOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/momentum_op.h
浏览文件 @
a9f5f822
...
...
@@ -15,11 +15,265 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
using
framework
::
SelectedRows
;
struct
NoNesterov
;
struct
UseNesterov
;
template
<
typename
T
>
class
CPUDenseMomentumFunctor
{
private:
const
Tensor
*
param
;
const
Tensor
*
grad
;
const
Tensor
*
velocity
;
const
Tensor
*
learning_rate
;
const
T
mu
;
const
T
use_nesterov
;
Tensor
*
param_out
;
Tensor
*
velocity_out
;
public:
CPUDenseMomentumFunctor
(
const
Tensor
*
param
,
const
Tensor
*
grad
,
const
Tensor
*
velocity
,
const
Tensor
*
learning_rate
,
const
T
mu
,
const
bool
use_nesterov
,
Tensor
*
param_out
,
Tensor
*
velocity_out
)
:
param
(
param
),
grad
(
grad
),
velocity
(
velocity
),
learning_rate
(
learning_rate
),
mu
(
mu
),
use_nesterov
(
use_nesterov
),
param_out
(
param_out
),
velocity_out
(
velocity_out
)
{}
inline
void
operator
()()
{
auto
p_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
v_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity_out
);
auto
p
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param
);
auto
v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity
);
auto
g
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
grad
);
auto
*
lr
=
learning_rate
->
data
<
T
>
();
v_out
=
v
*
mu
+
g
;
if
(
use_nesterov
)
{
p_out
=
p
-
(
g
+
v_out
*
mu
)
*
lr
[
0
];
}
else
{
p_out
=
p
-
lr
[
0
]
*
v_out
;
}
}
};
template
<
typename
T
,
typename
UpdateMethod
>
class
DenseMomentumFunctor
;
// NOTE(dzh) for performance.
// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two
// functor.
template
<
typename
T
>
class
DenseMomentumFunctor
<
T
,
UseNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
num_
;
T
*
p_out_
;
T
*
v_out_
;
public:
DenseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
learning_rate
),
mu_
(
mu
),
num_
(
num
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
g
=
g_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
(
g
+
v_out
*
mu_
)
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
T
>
class
DenseMomentumFunctor
<
T
,
NoNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
num_
;
T
*
p_out_
;
T
*
v_out_
;
public:
DenseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
learning_rate
),
mu_
(
mu
),
num_
(
num
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
g
=
g_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
lr
*
v_out
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
// TODO(dzh): enhance speed use eigen
// template<typename T>
// class CPUSparseMomentumFunctor {
// private:
// const T* p_;
// const T* g_;
// const T* v_;
// const T* lr_;
// const T mu_;
// const bool use_nesterov_;
// const int64_t* rows_;
// const int64_t row_numel_;
// const int64_t row_height_;
// T* p_out_;
// T* v_out_;
// public:
// CPUSparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
// const T mu, const bool use_nesterov, const int64_t* rows, const int64_t
// row_numel, const int64_t row_height, T* p_out, T* v_out) :p_(p), g_(g),
// v_(v), lr_(lr), mu_(mu), rows_(rows), row_numel_(row_numel),
// row_height_(row_height), p_out_(p_out), v_out_(v_out) {}
// inline void operator()() {
// }
// };
template
<
typename
T
,
typename
UpdateMethod
>
class
SparseMomentumFunctor
;
template
<
typename
T
>
class
SparseMomentumFunctor
<
T
,
UseNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
*
rows_
;
const
int64_t
row_numel_
;
const
int64_t
row_height_
;
T
*
p_out_
;
T
*
v_out_
;
public:
SparseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
lr
,
const
T
mu
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_height
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
lr
),
mu_
(
mu
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_height_
(
row_height
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_height_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
g_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
(
g
+
v_out
*
mu_
)
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
T
>
class
SparseMomentumFunctor
<
T
,
NoNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
*
rows_
;
const
int64_t
row_numel_
;
const
int64_t
row_height_
;
T
*
p_out_
;
T
*
v_out_
;
public:
SparseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
lr
,
const
T
mu
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_height
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
lr
),
mu_
(
mu
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_height_
(
row_height
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_height_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
g_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
v_out
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
...
...
@@ -29,65 +283,88 @@ class MomentumOpKernel : public framework::OpKernel<T> {
auto
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
velocity_var
=
ctx
.
InputVar
(
"Velocity"
);
auto
*
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
PADDLE_ENFORCE
(
velocity_var
->
IsType
<
framework
::
LoDTensor
>
(),
"Unmatched Type of Param and Grad"
);
auto
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
p_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
v_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity_out
);
auto
p
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param
);
auto
v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity
);
auto
g
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
grad
);
auto
*
lr
=
learning_rate
->
data
<
T
>
();
v_out
=
v
*
mu
+
g
;
if
(
use_nesterov
)
{
p_out
=
p
-
(
g
+
v_out
*
mu
)
*
lr
[
0
];
}
else
{
p_out
=
p
-
lr
[
0
]
*
v_out
;
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
CPUDenseMomentumFunctor
<
T
>
functor
(
param
,
grad
,
velocity
,
learning_rate
,
mu
,
use_nesterov
,
param_out
,
velocity_out
);
functor
();
}
else
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
->
numel
());
if
(
use_nesterov
)
{
DenseMomentumFunctor
<
T
,
UseNesterov
>
functor
(
param
->
data
<
T
>
(),
grad
->
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
param
->
numel
(),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
else
{
DenseMomentumFunctor
<
T
,
NoNesterov
>
functor
(
param
->
data
<
T
>
(),
grad
->
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
param
->
numel
(),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
}
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
// sparse update embedding with selectedrows
PADDLE_ENFORCE
(
velocity_var
->
IsType
<
framework
::
SelectedRows
>
(),
"Unmatched Type of Param and Grad"
);
auto
velocity
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
SelectedRows
>
(
"VelocityOut"
);
// sparse update maybe empty.
if
(
grad
->
rows
().
size
()
==
0
)
{
VLOG
(
3
)
<<
"Grad SelectedRows contains no data!"
;
return
;
}
PADDLE_ENFORCE
(
grad
->
height
()
==
velocity
->
height
(),
"Unmatched gradient and velocity."
);
auto
*
p_out
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
v_out
=
velocity_out
->
mutable_value
()
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
lr
=
learning_rate
->
data
<
T
>
();
auto
*
p
=
param
->
data
<
T
>
();
auto
*
g
=
grad
->
value
().
data
<
T
>
();
auto
*
v
=
velocity
->
value
().
data
<
T
>
();
size_t
grad_row_numel
=
grad
->
value
().
numel
()
/
grad
->
rows
().
size
();
for
(
size_t
i
=
0
;
i
<
grad
->
rows
().
size
();
++
i
)
{
size_t
grad_row_index
=
grad
->
rows
()[
i
];
for
(
size_t
j
=
0
;
j
<
grad_row_numel
;
++
j
)
{
size_t
p_i
=
grad_row_index
*
grad_row_numel
+
j
;
size_t
g_i
=
i
*
grad_row_numel
+
j
;
v_out
[
g_i
]
=
v
[
g_i
]
*
mu
+
g
[
g_i
];
if
(
use_nesterov
)
{
p_out
[
p_i
]
=
p
[
p_i
]
-
(
g
[
g_i
]
+
v_out
[
g_i
]
*
mu
)
*
lr
[
0
];
}
else
{
p_out
[
p_i
]
=
p
[
p_i
]
-
v_out
[
g_i
]
*
lr
[
0
];
}
}
auto
*
merged_grad
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
()
->
GetMutable
<
framework
::
SelectedRows
>
();
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
merge_func
(
ctx
.
template
device_context
<
DeviceContext
>(),
*
grad
,
merged_grad
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
->
numel
());
const
int64_t
*
rows
=
nullptr
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
rows
=
merged_grad
->
rows
().
CUDAData
(
ctx
.
GetPlace
());
}
else
{
rows
=
merged_grad
->
rows
().
data
();
}
if
(
use_nesterov
)
{
SparseMomentumFunctor
<
T
,
UseNesterov
>
functor
(
param
->
data
<
T
>
(),
merged_grad
->
value
().
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
rows
,
static_cast
<
int64_t
>
(
merged_grad
->
rows
().
size
()),
static_cast
<
int64_t
>
(
merged_grad
->
height
()),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
else
{
SparseMomentumFunctor
<
T
,
NoNesterov
>
functor
(
param
->
data
<
T
>
(),
merged_grad
->
value
().
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
rows
,
static_cast
<
int64_t
>
(
merged_grad
->
rows
().
size
()),
static_cast
<
int64_t
>
(
merged_grad
->
height
()),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
}
else
{
PADDLE_THROW
(
"Unsupported Variable Type of Grad"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录