Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
6a4e9230
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6a4e9230
编写于
10月 18, 2018
作者:
T
Tao Luo
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into mkldnn_test
上级
b8196843
078223b3
变更
24
隐藏空白更改
内联
并排
Showing
24 changed file
with
689 addition
and
170 deletion
+689
-170
benchmark/fluid/run.sh
benchmark/fluid/run.sh
+0
-0
paddle/fluid/framework/op_desc.h
paddle/fluid/framework/op_desc.h
+0
-10
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+1
-3
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+1
-3
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+2
-5
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+2
-6
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+1
-2
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+2
-5
paddle/fluid/operators/distributed/CMakeLists.txt
paddle/fluid/operators/distributed/CMakeLists.txt
+1
-1
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+74
-15
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+2
-0
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+1
-1
paddle/fluid/operators/momentum_op.cc
paddle/fluid/operators/momentum_op.cc
+40
-13
paddle/fluid/operators/momentum_op.cu
paddle/fluid/operators/momentum_op.cu
+3
-72
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+311
-20
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+20
-6
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+1
-0
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+3
-1
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+6
-2
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+3
-2
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+4
-2
python/paddle/fluid/tests/unittests/test_momentum_op.py
python/paddle/fluid/tests/unittests/test_momentum_op.py
+94
-0
python/paddle/utils/__init__.py
python/paddle/utils/__init__.py
+2
-1
python/paddle/utils/plot.py
python/paddle/utils/plot.py
+115
-0
未找到文件。
benchmark/fluid/run.sh
100644 → 100755
浏览文件 @
6a4e9230
文件模式从 100644 更改为 100755
paddle/fluid/framework/op_desc.h
浏览文件 @
6a4e9230
...
...
@@ -100,16 +100,6 @@ class OpDesc {
std
::
vector
<
std
::
string
>
InputNames
()
const
{
return
MapKeys
(
inputs_
);
}
std
::
vector
<
std
::
string
>
OutputNames
()
const
{
return
MapKeys
(
outputs_
);
}
void
SetInputMap
(
const
VariableNameMap
&
input
)
{
this
->
inputs_
=
input
;
this
->
need_update_
=
true
;
}
void
SetOutputMap
(
const
VariableNameMap
&
output
)
{
this
->
outputs_
=
output
;
this
->
need_update_
=
true
;
}
const
VariableNameMap
&
Inputs
()
const
{
return
inputs_
;
}
const
VariableNameMap
&
Outputs
()
const
{
return
outputs_
;
}
...
...
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
6a4e9230
...
...
@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
config
.
model_dir
=
model_path
;
config
.
use_gpu
=
false
;
config
.
device
=
0
;
auto
predictor
=
::
paddle
::
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
auto
predictor
=
::
paddle
::
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
// One single batch
...
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
6a4e9230
...
...
@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
config
.
model_dir
=
FLAGS_dirname
+
"/word2vec.inference.model"
;
config
.
use_feed_fetch_ops
=
false
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
auto
w0
=
predictor
->
GetInputTensor
(
"firstw"
);
auto
w1
=
predictor
->
GetInputTensor
(
"secondw"
);
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
6a4e9230
...
...
@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1
.
device
=
0
;
config1
.
max_batch_size
=
10
;
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config0
);
auto
predictor1
=
CreatePaddlePredictor
<
MixedRTConfig
,
PaddleEngineKind
::
kAutoMixedTensorRT
>
(
config1
);
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
>
(
config0
);
auto
predictor1
=
CreatePaddlePredictor
<
MixedRTConfig
>
(
config1
);
for
(
int
batch_id
=
0
;
batch_id
<
1
;
batch_id
++
)
{
//# 2. Prepare input.
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
6a4e9230
...
...
@@ -308,18 +308,14 @@ TEST(Analyzer_rnn1, ZeroCopy) {
PaddlePlace
place
;
int
output_size
{
0
};
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
config
.
use_feed_fetch_ops
=
true
;
auto
native_predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
config
.
use_feed_fetch_ops
=
true
;
// the analysis predictor needs feed/fetch.
auto
analysis_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
auto
analysis_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
6a4e9230
...
...
@@ -79,8 +79,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std
::
unique_ptr
<
PaddlePredictor
>
CreateTestPredictor
(
const
AnalysisConfig
&
config
,
bool
use_analysis
=
true
)
{
if
(
use_analysis
)
{
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
config
);
return
CreatePaddlePredictor
<
contrib
::
AnalysisConfig
>
(
config
);
}
else
{
return
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
6a4e9230
...
...
@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
config1
.
model_dir
=
model_dirname
;
config1
.
max_batch_size
=
batch_size
;
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config0
);
auto
predictor1
=
CreatePaddlePredictor
<
MixedRTConfig
,
PaddleEngineKind
::
kAutoMixedTensorRT
>
(
config1
);
auto
predictor0
=
CreatePaddlePredictor
<
NativeConfig
>
(
config0
);
auto
predictor1
=
CreatePaddlePredictor
<
MixedRTConfig
>
(
config1
);
// Prepare inputs
int
height
=
224
;
int
width
=
224
;
...
...
paddle/fluid/operators/distributed/CMakeLists.txt
浏览文件 @
6a4e9230
...
...
@@ -20,7 +20,7 @@ if(WITH_GRPC)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL
)
cc_test
(
rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL
)
cc_test
(
varhandle_test SRCS varhandle_test.cc
)
cc_test
(
varhandle_test SRCS varhandle_test.cc
DEPS profiler
)
return
()
endif
()
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
6a4e9230
...
...
@@ -73,10 +73,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
SendProcessor
*
s
=
new
SendProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"Send"
,
var_name_val
,
p_ctx
,
p_scope
));
const
std
::
string
method
=
"SendRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
var_name_val
,
p_ctx
,
p_scope
));
s
->
Prepare
(
h
,
time_out
);
framework
::
AsyncIO
([
var_name_val
,
p_scope
,
p_ctx
,
s
,
this
]
{
framework
::
AsyncIO
([
var_name_val
,
p_scope
,
p_ctx
,
s
,
method
,
h
,
this
]
{
auto
*
var
=
p_scope
->
FindVar
(
var_name_val
);
::
grpc
::
ByteBuffer
req
;
...
...
@@ -87,10 +88,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
nullptr
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/SendVariable"
,
req
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
});
req_count_
++
;
...
...
@@ -122,10 +129,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"Get"
,
var_name_val
,
p_ctx
,
p_scope
));
const
std
::
string
method
=
"GetRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
var_name_val
,
p_ctx
,
p_scope
));
s
->
Prepare
(
h
,
time_out
);
framework
::
AsyncIO
([
var_name_val
,
s
,
this
]
{
framework
::
AsyncIO
([
var_name_val
,
s
,
method
,
p_ctx
,
h
,
this
]
{
// prepare input
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
var_name_val
);
...
...
@@ -137,10 +145,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/GetVariable"
,
buf
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
});
req_count_
++
;
...
...
@@ -161,12 +175,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"Prefetch"
,
out_var_name_val
,
p_ctx
,
p_scope
));
const
std
::
string
method
=
"PrefetchRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
out_var_name_val
,
p_ctx
,
p_scope
));
s
->
Prepare
(
h
,
time_out
);
framework
::
AsyncIO
([
in_var_name_val
,
out_var_name_val
,
ep_val
,
p_scope
,
p_ctx
,
s
,
this
]
{
s
,
method
,
h
,
this
]
{
auto
*
var
=
p_scope
->
FindVar
(
in_var_name_val
);
::
grpc
::
ByteBuffer
req
;
...
...
@@ -177,11 +193,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/PrefetchVariable"
,
req
,
&
cq_
);
call
->
StartCall
();
call
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
static_cast
<
void
*>
(
s
));
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
});
req_count_
++
;
...
...
@@ -193,15 +215,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
const
auto
ch
=
GetChannel
(
ep
);
BatchBarrierProcessor
*
s
=
new
BatchBarrierProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"BatchBarrier"
,
BATCH_BARRIER_MESSAGE
,
nullptr
,
nullptr
));
const
std
::
string
method
=
"BatchBarrierRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
BATCH_BARRIER_MESSAGE
,
nullptr
,
nullptr
));
s
->
Prepare
(
h
,
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
BATCH_BARRIER_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
return
h
;
}
...
...
@@ -209,15 +240,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t
time_out
)
{
const
auto
ch
=
GetChannel
(
ep
);
FetchBarrierProcessor
*
s
=
new
FetchBarrierProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"FetchBarrier"
,
FETCH_BARRIER_MESSAGE
,
nullptr
,
nullptr
));
const
std
::
string
method
=
"FetchBarrierRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
FETCH_BARRIER_MESSAGE
,
nullptr
,
nullptr
));
s
->
Prepare
(
h
,
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
FETCH_BARRIER_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
return
h
;
}
...
...
@@ -226,15 +266,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
const
auto
ch
=
GetChannel
(
ep
);
BatchBarrierProcessor
*
s
=
new
BatchBarrierProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"SendComplete"
,
COMPLETE_MESSAGE
,
nullptr
,
nullptr
));
const
std
::
string
method
=
"SendCompleteRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
COMPLETE_MESSAGE
,
nullptr
,
nullptr
));
s
->
Prepare
(
h
,
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
COMPLETE_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
return
h
;
}
...
...
@@ -244,17 +292,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const
auto
ch
=
GetChannel
(
ep
);
CheckpointNotifyProcessor
*
s
=
new
CheckpointNotifyProcessor
(
ch
);
VarHandlePtr
h
(
new
VarHandle
(
ep
,
"CheckPointNotify"
,
CHECKPOINT_SAVE_MESSAGE
,
nullptr
,
nullptr
));
const
std
::
string
method
=
"CheckPointNotifyRPC"
;
VarHandlePtr
h
(
new
VarHandle
(
ep
,
method
,
CHECKPOINT_SAVE_MESSAGE
,
nullptr
,
nullptr
));
s
->
Prepare
(
h
,
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
CHECKPOINT_SAVE_MESSAGE
);
req
.
set_out_varname
(
dir
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncCheckpointNotify
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
if
(
UNLIKELY
(
platform
::
IsProfileEnabled
()))
{
h
->
Wait
();
}
return
h
;
}
...
...
@@ -273,6 +331,7 @@ void GRPCClient::Proceed() {
BaseProcessor
*
c
=
static_cast
<
BaseProcessor
*>
(
tag
);
GPR_ASSERT
(
ok
);
PADDLE_ENFORCE
(
c
);
if
(
c
->
status_
.
ok
())
{
VLOG
(
3
)
<<
c
->
GetVarHandlePtr
()
->
String
()
<<
" process"
;
c
->
Process
();
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
6a4e9230
...
...
@@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
,
const
std
::
string
&
out_name
)
{
platform
::
RecordEvent
record_event
(
"serial"
,
&
ctx
);
// Default DestroyCallback does nothing, When using GPU
// the CPU buffer need to be freed.
DestroyCallback
destroy_callback
=
[](
void
*
backing
)
{};
...
...
@@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Scope
*
scope
,
framework
::
Variable
**
var
)
{
platform
::
RecordEvent
record_event
(
"deserial"
,
&
ctx
);
operators
::
distributed
::
GRPCVariableResponse
resp
(
scope
,
&
ctx
);
PADDLE_ENFORCE
(
resp
.
Parse
(
msg
)
==
0
,
"parse bytebuffer to tensor error!"
);
*
var
=
resp
.
GetVar
();
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
6a4e9230
...
...
@@ -66,7 +66,7 @@ static void ParallelExecuteBlocks(
<<
"pointer: "
<<
prepared
[
run_block
].
get
();
executor
->
RunPreparedContext
(
prepared
[
run_block
].
get
(),
scope
);
}
catch
(
const
std
::
exception
&
e
)
{
LOG
(
ERROR
)
<<
"run sub program
error "
<<
e
.
what
();
LOG
(
FATAL
)
<<
"run sub program:"
<<
idx
<<
"
error "
<<
e
.
what
();
}
}));
}
...
...
paddle/fluid/operators/momentum_op.cc
浏览文件 @
6a4e9230
...
...
@@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
"Input(param) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
...
...
@@ -45,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel {
"Output(VelocityOut) of Momentum should not be null."
);
auto
param_dim
=
ctx
->
GetInputDim
(
"Param"
);
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Grad"
),
"Param and Grad input of MomentumOp should have the same dimension."
);
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Velocity"
),
"Param and Velocity of MomentumOp should have the same dimension."
);
if
(
ctx
->
GetInputsVarType
(
"Grad"
)[
0
]
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Grad"
),
"Param and Grad input of MomentumOp should have the same dimension."
);
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Velocity"
),
"Param and Velocity of MomentumOp should have the same dimension."
);
}
PADDLE_ENFORCE_EQ
(
framework
::
product
(
ctx
->
GetInputDim
(
"LearningRate"
)),
1
,
"Learning_rate should be a scalar"
);
...
...
@@ -58,13 +61,34 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
"VelocityOut"
,
param_dim
);
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"Param"
)
->
type
());
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
class
MomentumOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
input_var
=
op_desc
.
Input
(
"Param"
)[
0
];
for
(
auto
&
out_var
:
op_desc
.
Output
(
"ParamOut"
))
{
if
(
block
->
FindRecursiveOrCreateVar
(
input_var
).
GetType
()
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
if
(
block
->
FindRecursiveOrCreateVar
(
input_var
).
GetType
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
else
{
PADDLE_THROW
(
"Only support LodTensor and SelectedRows, Unexpected Input Type."
);
}
}
}
};
class
MomentumOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
...
...
@@ -115,6 +139,9 @@ $$
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_WITHOUT_GRADIENT
(
momentum
,
ops
::
MomentumOp
,
ops
::
MomentumOpMaker
);
REGISTER_OP_CPU_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
float
>
,
ops
::
MomentumOpKernel
<
double
>
);
REGISTER_OPERATOR
(
momentum
,
ops
::
MomentumOp
,
ops
::
MomentumOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
,
ops
::
MomentumOpInferVarType
);
REGISTER_OP_CPU_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/momentum_op.cu
浏览文件 @
6a4e9230
...
...
@@ -15,76 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
__global__
void
MomentumKernel
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
bool
use_nesterov
,
T
*
p_out
,
T
*
v_out
)
{
T
lr
=
learning_rate
[
0
];
if
(
use_nesterov
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
T
g_val
=
g
[
i
];
T
v_new
=
v
[
i
]
*
mu
+
g_val
;
v_out
[
i
]
=
v_new
;
p_out
[
i
]
=
p
[
i
]
-
(
g_val
+
v_new
*
mu
)
*
lr
;
}
}
else
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
T
v_new
=
v
[
i
]
*
mu
+
g
[
i
];
v_out
[
i
]
=
v_new
;
p_out
[
i
]
=
p
[
i
]
-
lr
*
v_new
;
}
}
}
template
<
typename
T
>
class
MomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
T
*
p_out
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
v_out
=
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
bool
use_nesterov
=
ctx
.
Attr
<
bool
>
(
"use_nesterov"
);
auto
*
p
=
param
->
data
<
T
>
();
auto
*
v
=
velocity
->
data
<
T
>
();
auto
*
g
=
grad
->
data
<
T
>
();
auto
*
lr
=
learning_rate
->
data
<
T
>
();
int
block
=
512
;
int
grid
=
(
param
->
numel
()
+
block
-
1
)
/
block
;
MomentumKernel
<
T
><<<
grid
,
block
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
p
,
g
,
v
,
lr
,
mu
,
param
->
numel
(),
use_nesterov
,
p_out
,
v_out
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
momentum
,
ops
::
MomentumOpCUDAKernel
<
float
>
,
ops
::
MomentumOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
momentum
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MomentumOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/momentum_op.h
浏览文件 @
6a4e9230
...
...
@@ -13,35 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
using
framework
::
Tensor
;
using
framework
::
SelectedRows
;
struct
NoNesterov
;
struct
UseNesterov
;
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
template
<
typename
T
>
class
CPUDenseMomentumFunctor
{
private:
const
Tensor
*
param
;
const
Tensor
*
grad
;
const
Tensor
*
velocity
;
const
Tensor
*
learning_rate
;
const
T
mu
;
const
T
use_nesterov
;
Tensor
*
param_out
;
Tensor
*
velocity_out
;
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
bool
use_nesterov
=
ctx
.
Attr
<
bool
>
(
"use_nesterov"
);
public:
CPUDenseMomentumFunctor
(
const
Tensor
*
param
,
const
Tensor
*
grad
,
const
Tensor
*
velocity
,
const
Tensor
*
learning_rate
,
const
T
mu
,
const
bool
use_nesterov
,
Tensor
*
param_out
,
Tensor
*
velocity_out
)
:
param
(
param
),
grad
(
grad
),
velocity
(
velocity
),
learning_rate
(
learning_rate
),
mu
(
mu
),
use_nesterov
(
use_nesterov
),
param_out
(
param_out
),
velocity_out
(
velocity_out
)
{}
inline
void
operator
()()
{
auto
p_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
v_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity_out
);
...
...
@@ -59,5 +72,283 @@ class MomentumOpKernel : public framework::OpKernel<T> {
}
};
template
<
typename
T
,
typename
UpdateMethod
>
class
DenseMomentumFunctor
;
// NOTE(dzh) for performance.
// avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two
// functor.
template
<
typename
T
>
class
DenseMomentumFunctor
<
T
,
UseNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
num_
;
T
*
p_out_
;
T
*
v_out_
;
public:
DenseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
learning_rate
),
mu_
(
mu
),
num_
(
num
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
g
=
g_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
(
g
+
v_out
*
mu_
)
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
T
>
class
DenseMomentumFunctor
<
T
,
NoNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
num_
;
T
*
p_out_
;
T
*
v_out_
;
public:
DenseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
learning_rate
),
mu_
(
mu
),
num_
(
num
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
g
=
g_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
lr
*
v_out
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
T
,
typename
UpdateMethod
>
class
SparseMomentumFunctor
;
template
<
typename
T
>
class
SparseMomentumFunctor
<
T
,
UseNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
*
rows_
;
const
int64_t
row_numel_
;
const
int64_t
row_height_
;
T
*
p_out_
;
T
*
v_out_
;
public:
SparseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
lr
,
const
T
mu
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_height
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
lr
),
mu_
(
mu
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_height_
(
row_height
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_height_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
g_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
(
g
+
v_out
*
mu_
)
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
T
>
class
SparseMomentumFunctor
<
T
,
NoNesterov
>
{
private:
const
T
*
p_
;
const
T
*
g_
;
const
T
*
v_
;
const
T
*
lr_
;
const
T
mu_
;
const
int64_t
*
rows_
;
const
int64_t
row_numel_
;
const
int64_t
row_height_
;
T
*
p_out_
;
T
*
v_out_
;
public:
SparseMomentumFunctor
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
lr
,
const
T
mu
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_height
,
T
*
p_out
,
T
*
v_out
)
:
p_
(
p
),
g_
(
g
),
v_
(
v
),
lr_
(
lr
),
mu_
(
mu
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_height_
(
row_height
),
p_out_
(
p_out
),
v_out_
(
v_out
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_height_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
g_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
// put memory access in register
const
T
p
=
p_
[
i
];
const
T
lr
=
lr_
[
0
];
const
T
v
=
v_
[
i
];
T
v_out
=
v
*
mu_
+
g
;
T
p_out
=
p
-
v_out
*
lr
;
// write reigster to memory
v_out_
[
i
]
=
v_out
;
p_out_
[
i
]
=
p_out
;
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
bool
use_nesterov
=
ctx
.
Attr
<
bool
>
(
"use_nesterov"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
velocity
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Velocity"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
);
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
CPUDenseMomentumFunctor
<
T
>
functor
(
param
,
grad
,
velocity
,
learning_rate
,
mu
,
use_nesterov
,
param_out
,
velocity_out
);
functor
();
}
else
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
->
numel
());
if
(
use_nesterov
)
{
DenseMomentumFunctor
<
T
,
UseNesterov
>
functor
(
param
->
data
<
T
>
(),
grad
->
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
param
->
numel
(),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
else
{
DenseMomentumFunctor
<
T
,
NoNesterov
>
functor
(
param
->
data
<
T
>
(),
grad
->
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
param
->
numel
(),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
}
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
// sparse update embedding with selectedrows
auto
grad
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
);
// sparse update maybe empty.
if
(
grad
->
rows
().
size
()
==
0
)
{
VLOG
(
3
)
<<
"Grad SelectedRows contains no data!"
;
return
;
}
auto
*
merged_grad
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
()
->
GetMutable
<
framework
::
SelectedRows
>
();
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
merge_func
(
ctx
.
template
device_context
<
DeviceContext
>(),
*
grad
,
merged_grad
);
const
int64_t
*
rows
=
nullptr
;
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
rows
=
merged_grad
->
rows
().
CUDAData
(
ctx
.
GetPlace
());
}
else
{
#endif
rows
=
merged_grad
->
rows
().
data
();
#ifdef PADDLE_WITH_CUDA
}
#endif
int64_t
row_numel
=
merged_grad
->
value
().
numel
()
/
merged_grad
->
rows
().
size
();
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
->
numel
());
if
(
use_nesterov
)
{
SparseMomentumFunctor
<
T
,
UseNesterov
>
functor
(
param
->
data
<
T
>
(),
merged_grad
->
value
().
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
rows
,
row_numel
,
static_cast
<
int64_t
>
(
merged_grad
->
rows
().
size
()),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
else
{
SparseMomentumFunctor
<
T
,
NoNesterov
>
functor
(
param
->
data
<
T
>
(),
merged_grad
->
value
().
data
<
T
>
(),
velocity
->
data
<
T
>
(),
learning_rate
->
data
<
T
>
(),
mu
,
rows
,
row_numel
,
static_cast
<
int64_t
>
(
merged_grad
->
rows
().
size
()),
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
for_range
(
functor
);
}
}
else
{
PADDLE_THROW
(
string
::
Sprintf
(
"MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s"
,
grad_var
->
Type
().
name
()));
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/sum_op.h
浏览文件 @
6a4e9230
...
...
@@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel<T> {
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
auto
result
=
EigenVector
<
T
>::
Flatten
(
*
out
);
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
int
start
=
in_place
?
1
:
0
;
if
(
!
in_place
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
if
((
in_num
>=
2
)
&&
in_vars
[
0
]
->
IsType
<
framework
::
LoDTensor
>
()
&&
in_vars
[
1
]
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
&
in_0
=
in_vars
[
0
]
->
Get
<
framework
::
LoDTensor
>
();
auto
&
in_1
=
in_vars
[
1
]
->
Get
<
framework
::
LoDTensor
>
();
if
(
in_0
.
numel
()
&&
in_1
.
numel
())
{
auto
in_0_e
=
EigenVector
<
T
>::
Flatten
(
in_0
);
auto
in_1_e
=
EigenVector
<
T
>::
Flatten
(
in_1
);
result
.
device
(
place
)
=
in_0_e
+
in_1_e
;
start
=
2
;
}
}
if
(
start
!=
2
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
}
}
math
::
SelectedRowsAddToTensor
<
DeviceContext
,
T
>
functor
;
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
// If in_place, just skip the first tensor
for
(
size_t
i
=
in_place
?
1
:
0
;
i
<
in_num
;
i
++
)
{
for
(
size_t
i
=
start
;
i
<
in_num
;
i
++
)
{
if
(
in_vars
[
i
]
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
&
in_t
=
in_vars
[
i
]
->
Get
<
framework
::
LoDTensor
>
();
if
(
in_t
.
numel
()
==
0
)
{
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
6a4e9230
...
...
@@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct
RecordEvent
{
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
~
RecordEvent
();
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
6a4e9230
...
...
@@ -390,7 +390,9 @@ function run_mac_test() {
Running unit tests ...
========================================
EOF
#remove proxy here to fix dist error on mac
export
http_proxy
=
export
https_proxy
=
# TODO: jiabin need to refine this part when these tests fixed on mac
ctest
--output-on-failure
-j
$1
# make install should also be test when unittest
...
...
python/paddle/fluid/framework.py
浏览文件 @
6a4e9230
...
...
@@ -1522,13 +1522,17 @@ class Program(object):
>>> with program.lr_schedule_guard():
>>> lr = lr * decay
"""
tmp_role
=
self
.
_current_role
tmp_var
=
self
.
_op_role_var
OpRole
=
core
.
op_proto_and_checker_maker
.
OpRole
self
.
_current_role
=
OpRole
.
LRSched
# TODO(typhoonzero): how to set target learning rate var
self
.
_op_role_var
=
[]
yield
self
.
_op_role_var
=
[]
self
.
_current_role
=
OpRole
.
Forward
self
.
_op_role_var
=
tmp_var
self
.
_current_role
=
tmp_role
def
__str__
(
self
):
"""
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
6a4e9230
...
...
@@ -15,7 +15,7 @@
from
__future__
import
print_function
import
re
from
collections
import
defaultdict
from
paddle.fluid.framework
import
Program
,
Variable
,
name_scope
from
paddle.fluid.framework
import
Program
,
Variable
,
name_scope
,
default_main_program
from
.
import
framework
from
.
import
layers
from
.backward
import
append_backward
...
...
@@ -111,7 +111,8 @@ class Optimizer(object):
if
param_lr
==
1.0
:
return
self
.
_global_learning_rate
()
else
:
return
self
.
_global_learning_rate
()
*
param_lr
with
default_main_program
().
_lr_schedule_guard
():
return
self
.
_global_learning_rate
()
*
param_lr
def
_create_accumulators
(
self
,
block
,
parameters
):
"""Create all accumulators needed by the parameters
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
6a4e9230
...
...
@@ -91,6 +91,8 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
need_envs
=
need_envs
)
# FIXME(tangwei): Learningrate variable is not created on pserver.
"""
class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
def _setup_config(self):
self._sync_mode = True
...
...
@@ -105,7 +107,7 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log
=
Fals
e
,
check_error_log=
Tru
e,
need_envs=need_envs)
...
...
@@ -143,7 +145,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
"""
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_momentum_op.py
浏览文件 @
6a4e9230
...
...
@@ -16,6 +16,8 @@ from __future__ import print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
op_test
import
OpTest
...
...
@@ -88,5 +90,97 @@ class TestMomentumOp2(OpTest):
self
.
check_output
()
class
TestSparseMomentumOp
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
use_nesterov
=
False
def
check_with_place
(
self
,
place
):
self
.
init_kernel
()
scope
=
core
.
Scope
()
# create and initialize Grad Variable
height
=
10
rows
=
[
0
,
4
,
7
]
row_numel
=
12
mu
=
1.0
use_nesterov
=
self
.
use_nesterov
# create and initialize Param Variable
param
=
scope
.
var
(
'Param'
).
get_tensor
()
param_array
=
np
.
full
((
height
,
row_numel
),
5.0
).
astype
(
"float32"
)
param
.
set
(
param_array
,
place
)
param_out
=
scope
.
var
(
"ParamOut"
).
get_tensor
()
param_out_array
=
np
.
full
((
height
,
row_numel
),
0.0
).
astype
(
"float32"
)
param_out
.
set
(
param_out_array
,
place
)
grad_selected_rows
=
scope
.
var
(
'Grad'
).
get_selected_rows
()
grad_selected_rows
.
set_height
(
height
)
grad_selected_rows
.
set_rows
(
rows
)
grad_np_array
=
np
.
ones
((
len
(
rows
),
row_numel
)).
astype
(
"float32"
)
grad_np_array
[
0
,
0
]
=
2.0
grad_np_array
[
2
,
8
]
=
4.0
grad_tensor
=
grad_selected_rows
.
get_tensor
()
grad_tensor
.
set
(
grad_np_array
,
place
)
velocity
=
scope
.
var
(
'Velocity'
).
get_tensor
()
velocity_np_array
=
np
.
ones
((
height
,
row_numel
)).
astype
(
"float32"
)
velocity
.
set
(
velocity_np_array
,
place
)
velocity_out
=
scope
.
var
(
'VelocityOut'
).
get_tensor
()
velocity_out_np_array
=
np
.
full
((
height
,
row_numel
),
0.0
).
astype
(
"float32"
)
velocity_out
.
set
(
velocity_out_np_array
,
place
)
# create and initialize LeraningRate Variable
lr
=
scope
.
var
(
'LearningRate'
).
get_tensor
()
lr_array
=
np
.
full
((
1
),
2.0
).
astype
(
"float32"
)
lr
.
set
(
lr_array
,
place
)
# create and run operator
op
=
Operator
(
"momentum"
,
Param
=
'Param'
,
Grad
=
'Grad'
,
Velocity
=
'Velocity'
,
ParamOut
=
'ParamOut'
,
VelocityOut
=
'VelocityOut'
,
LearningRate
=
'LearningRate'
,
mu
=
mu
,
use_nesterov
=
use_nesterov
)
op
.
run
(
scope
,
place
)
# get and compare result
param_out_np_array
=
np
.
array
(
param_out
)
velocity_out_np_array
=
np
.
array
(
velocity_out
)
# TODO(dzh): add a more suitable general numpy interface
# for sparse update.
_grad_np_array
=
np
.
full
((
height
,
row_numel
),
0.0
).
astype
(
"float32"
)
for
i
in
range
(
len
(
rows
)):
_grad_np_array
[
rows
[
i
]]
=
grad_np_array
[
i
]
_velocity_out
=
mu
*
velocity_np_array
+
_grad_np_array
_param
=
param_array
if
use_nesterov
:
_param_out
=
_param
-
(
_grad_np_array
+
_velocity_out
*
mu
)
*
lr_array
else
:
_param_out
=
_param
-
lr_array
*
_velocity_out
self
.
assertTrue
((
_velocity_out
==
velocity_out_np_array
).
all
())
self
.
assertTrue
((
_param_out
==
param_out_np_array
).
all
())
def
init_kernel
(
self
):
pass
def
test_sparse_momentum
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
class
TestSparseMomentumOp2
(
TestSparseMomentumOp
):
def
init_kernel
(
self
):
self
.
use_nesterov
=
True
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/utils/__init__.py
浏览文件 @
6a4e9230
...
...
@@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__
=
[
'dump_config'
]
from
plot
import
Ploter
__all__
=
[
'dump_config'
,
'Ploter'
]
python/paddle/utils/plot.py
0 → 100644
浏览文件 @
6a4e9230
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
class
PlotData
(
object
):
def
__init__
(
self
):
self
.
step
=
[]
self
.
value
=
[]
def
append
(
self
,
step
,
value
):
self
.
step
.
append
(
step
)
self
.
value
.
append
(
value
)
def
reset
(
self
):
self
.
step
=
[]
self
.
value
=
[]
class
Ploter
(
object
):
"""
Plot input data in a 2D graph
Args:
title: assign the title of input data.
step: x_axis of the data.
value: y_axis of the data.
"""
def
__init__
(
self
,
*
args
):
self
.
__args__
=
args
self
.
__plot_data__
=
{}
for
title
in
args
:
self
.
__plot_data__
[
title
]
=
PlotData
()
# demo in notebooks will use Ploter to plot figure, but when we convert
# the ipydb to py file for testing, the import of matplotlib will make the
# script crash. So we can use `export DISABLE_PLOT=True` to disable import
# these libs
self
.
__disable_plot__
=
os
.
environ
.
get
(
"DISABLE_PLOT"
)
if
not
self
.
__plot_is_disabled__
():
import
matplotlib.pyplot
as
plt
from
IPython
import
display
self
.
plt
=
plt
self
.
display
=
display
def
__plot_is_disabled__
(
self
):
return
self
.
__disable_plot__
==
"True"
def
append
(
self
,
title
,
step
,
value
):
"""
Feed data
Args:
title: assign the group data to this subtitle.
step: the x_axis of data.
value: the y_axis of data.
Examples:
.. code-block:: python
plot_curve = Ploter("Curve 1","Curve 2")
plot_curve.append(title="Curve 1",step=1,value=1)
"""
assert
isinstance
(
title
,
basestring
)
assert
self
.
__plot_data__
.
has_key
(
title
)
data
=
self
.
__plot_data__
[
title
]
assert
isinstance
(
data
,
PlotData
)
data
.
append
(
step
,
value
)
def
plot
(
self
,
path
=
None
):
"""
Plot data in a 2D graph
Args:
path: store the figure to this file path. Defaul None.
Examples:
.. code-block:: python
plot_curve = Ploter()
plot_cure.plot()
"""
if
self
.
__plot_is_disabled__
():
return
titles
=
[]
for
title
in
self
.
__args__
:
data
=
self
.
__plot_data__
[
title
]
assert
isinstance
(
data
,
PlotData
)
if
len
(
data
.
step
)
>
0
:
titles
.
append
(
title
)
self
.
plt
.
plot
(
data
.
step
,
data
.
value
)
self
.
plt
.
legend
(
titles
,
loc
=
'upper left'
)
if
path
is
None
:
self
.
display
.
clear_output
(
wait
=
True
)
self
.
display
.
display
(
self
.
plt
.
gcf
())
else
:
self
.
plt
.
savefig
(
path
)
self
.
plt
.
gcf
().
clear
()
def
reset
(
self
):
for
key
in
self
.
__plot_data__
:
data
=
self
.
__plot_data__
[
key
]
assert
isinstance
(
data
,
PlotData
)
data
.
reset
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录