Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
2a672d24
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2a672d24
编写于
7月 16, 2018
作者:
L
Luo Tao
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into demo
上级
09a75882
f02a4da6
变更
39
显示空白变更内容
内联
并排
Showing
39 changed file
with
484 addition
and
147 deletion
+484
-147
CMakeLists.txt
CMakeLists.txt
+11
-0
Dockerfile
Dockerfile
+1
-1
cmake/external/libxsmm.cmake
cmake/external/libxsmm.cmake
+57
-0
cmake/external/openblas.cmake
cmake/external/openblas.cmake
+5
-0
cmake/external/python.cmake
cmake/external/python.cmake
+3
-2
paddle/fluid/framework/details/multi_devices_graph_builder.cc
...le/fluid/framework/details/multi_devices_graph_builder.cc
+26
-12
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+23
-7
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+1
-1
paddle/fluid/operators/checkpoint_notify_op.cc
paddle/fluid/operators/checkpoint_notify_op.cc
+1
-1
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+11
-2
paddle/fluid/operators/distributed/grpc_client.h
paddle/fluid/operators/distributed/grpc_client.h
+3
-2
paddle/fluid/operators/distributed/rpc_client.h
paddle/fluid/operators/distributed/rpc_client.h
+1
-1
paddle/fluid/operators/fetch_barrier_op.cc
paddle/fluid/operators/fetch_barrier_op.cc
+2
-2
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+4
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+58
-2
paddle/fluid/operators/math/math_function_test.cc
paddle/fluid/operators/math/math_function_test.cc
+57
-1
paddle/fluid/operators/prefetch_op.cc
paddle/fluid/operators/prefetch_op.cc
+1
-1
paddle/fluid/operators/recv_op.cc
paddle/fluid/operators/recv_op.cc
+1
-1
paddle/fluid/operators/send_barrier_op.cc
paddle/fluid/operators/send_barrier_op.cc
+2
-2
paddle/fluid/operators/send_op.cc
paddle/fluid/operators/send_op.cc
+1
-1
paddle/fluid/operators/top_k_op.h
paddle/fluid/operators/top_k_op.h
+1
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-2
paddle/legacy/utils/PythonUtil.cpp
paddle/legacy/utils/PythonUtil.cpp
+6
-0
paddle/legacy/utils/PythonUtil.h
paddle/legacy/utils/PythonUtil.h
+28
-0
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+9
-1
python/paddle/__init__.py
python/paddle/__init__.py
+6
-6
python/paddle/dataset/__init__.py
python/paddle/dataset/__init__.py
+14
-14
python/paddle/fluid/clip.py
python/paddle/fluid/clip.py
+9
-5
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+7
-4
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+19
-13
python/paddle/fluid/regularizer.py
python/paddle/fluid/regularizer.py
+5
-6
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+11
-4
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+22
-26
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
...fluid/tests/unittests/test_parallel_executor_seresnext.py
+59
-14
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+3
-1
python/paddle/reader/__init__.py
python/paddle/reader/__init__.py
+3
-3
python/paddle/reader/decorator.py
python/paddle/reader/decorator.py
+1
-1
python/requirements.txt
python/requirements.txt
+1
-1
python/setup.py.in
python/setup.py.in
+9
-7
未找到文件。
CMakeLists.txt
浏览文件 @
2a672d24
...
...
@@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option
(
WITH_GRPC
"Use grpc as the default rpc framework"
${
WITH_DISTRIBUTE
}
)
option
(
WITH_BRPC_RDMA
"Use brpc rdma as the rpc protocal"
OFF
)
option
(
WITH_SYSTEM_BLAS
"Use system blas library"
OFF
)
option
(
PY_VERSION
"Compile PaddlePaddle with python3 support"
${
PY_VERSION
}
)
# PY_VERSION
if
(
NOT PY_VERSION
)
set
(
PY_VERSION 2.7
)
endif
()
# CMAKE_BUILD_TYPE
if
(
NOT CMAKE_BUILD_TYPE
)
...
...
@@ -146,6 +152,7 @@ endif()
########################################################################################
include
(
external/mklml
)
# download mklml package
include
(
external/libxsmm
)
# download, build, install libxsmm
include
(
external/zlib
)
# download, build, install zlib
include
(
external/gflags
)
# download, build, install gflags
include
(
external/glog
)
# download, build, install glog
...
...
@@ -232,6 +239,10 @@ if(WITH_MKLML)
list
(
APPEND EXTERNAL_LIBS
${
MKLML_IOMP_LIB
}
)
endif
()
if
(
WITH_LIBXSMM
)
list
(
APPEND EXTERNAL_LIBS
${
LIBXSMM_LIBS
}
)
endif
()
if
(
WITH_MKLDNN
)
list
(
APPEND EXTERNAL_LIBS
${
MKLDNN_LIB
}
)
endif
()
...
...
Dockerfile
浏览文件 @
2a672d24
...
...
@@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip
install
opencv-python
#For docstring checker
RUN
pip
install
pylint pytest astroid isort
RUN
pip
install
pylint pytest astroid isort
LinkChecker
COPY
./python/requirements.txt /root/
RUN
pip
install
-r
/root/requirements.txt
...
...
cmake/external/libxsmm.cmake
0 → 100644
浏览文件 @
2a672d24
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
OPTION
(
WITH_LIBXSMM
"Compile with libxsmm"
OFF
)
IF
(
NOT WITH_LIBXSMM
)
return
()
ENDIF
()
IF
(
WIN32 OR APPLE OR ANDROID OR IOS
)
MESSAGE
(
WARNING
"Windows, Mac or Mobile are not supported with libxsmm in Paddle yet."
)
SET
(
WITH_LIBXSMM OFF CACHE STRING
"Disable LIBXSMM"
FORCE
)
return
()
ENDIF
()
INCLUDE
(
ExternalProject
)
SET
(
LIBXSMM_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/libxsmm
)
SET
(
LIBXSMM_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/libxsmm
)
SET
(
LIBXSMM_INCLUDE_DIR
"
${
LIBXSMM_INSTALL_DIR
}
/include"
CACHE PATH
"LIBXSMM include directory."
FORCE
)
SET
(
LIBXSMM_LIBRARY_DIR
"
${
LIBXSMM_INSTALL_DIR
}
/lib"
CACHE PATH
"LIBXSMM library directory."
FORCE
)
SET
(
LIBXSMM_LIBS
"
${
LIBXSMM_LIBRARY_DIR
}
/libxsmm.a"
"
${
LIBXSMM_LIBRARY_DIR
}
/libxsmmnoblas.a"
)
ExternalProject_Add
(
extern_libxsmm
GIT_REPOSITORY
"https://github.com/hfp/libxsmm.git"
GIT_TAG
"7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
PREFIX
${
LIBXSMM_SOURCES_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
BUILD_IN_SOURCE 1
BUILD_COMMAND $
(
MAKE
)
--silent PREFIX=
${
LIBXSMM_INSTALL_DIR
}
CXX=g++ CC=gcc WARP=0 install
INSTALL_COMMAND
""
)
ADD_LIBRARY
(
libxsmm STATIC IMPORTED GLOBAL
)
SET_PROPERTY
(
TARGET libxsmm PROPERTY IMPORTED_LOCATION
"
${
LIBXSMM_LIBRARY_DIR
}
/libxsmm.a"
)
SET_PROPERTY
(
TARGET libxsmm PROPERTY IMPORTED_LOCATION
"
${
LIBXSMM_LIBRARY_DIR
}
/libxsmmnoblas.a"
)
MESSAGE
(
STATUS
"Libxsmm library:
${
LIBXSMM_LIBS
}
"
)
include_directories
(
${
LIBXSMM_INCLUDE_DIR
}
)
ADD_DEFINITIONS
(
-DPADDLE_WITH_LIBXSMM
)
ADD_DEPENDENCIES
(
libxsmm extern_libxsmm
)
LIST
(
APPEND external_project_dependencies libxsmm
)
cmake/external/openblas.cmake
浏览文件 @
2a672d24
...
...
@@ -121,6 +121,11 @@ ELSE()
TARGET_LINK_LIBRARIES
(
cblas
${
CBLAS_LIBRARIES
}
)
ENDIF
(
"
${
CBLAS_PROVIDER
}
"
STREQUAL
"MKLML"
)
IF
(
WITH_LIBXSMM
)
TARGET_LINK_LIBRARIES
(
cblas
${
LIBXSMM_LIBS
}
)
ADD_DEPENDENCIES
(
cblas extern_libxsmm
)
ENDIF
()
IF
(
NOT
${
CBLAS_FOUND
}
)
ADD_DEPENDENCIES
(
cblas extern_openblas
)
LIST
(
APPEND external_project_dependencies cblas
)
...
...
cmake/external/python.cmake
浏览文件 @
2a672d24
...
...
@@ -18,8 +18,9 @@ ENDIF()
INCLUDE
(
python_module
)
FIND_PACKAGE
(
PythonInterp 2.7
)
FIND_PACKAGE
(
PythonLibs 2.7
)
FIND_PACKAGE
(
PythonInterp
${
PY_VERSION
}
)
FIND_PACKAGE
(
PythonLibs
${
PY_VERSION
}
)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY
(
python SHARED IMPORTED GLOBAL
)
SET_PROPERTY
(
TARGET python PROPERTY IMPORTED_LOCATION
${
PYTHON_LIBRARIES
}
)
...
...
paddle/fluid/framework/details/multi_devices_graph_builder.cc
浏览文件 @
2a672d24
...
...
@@ -276,6 +276,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
}
bool
use_gpu
=
false
;
#ifdef PADDLE_WITH_CUDA
use_gpu
=
nccl_ctxs_
!=
nullptr
;
#endif
if
(
use_gpu
||
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
// Insert BCast Ops
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_var_name_set
.
size
();
++
dev_id
)
{
auto
&
to_bcast_set
=
bcast_var_name_set
[
dev_id
];
...
...
@@ -283,6 +290,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
CreateBroadcastOp
(
&
result
,
bcast_name
,
dev_id
);
}
}
}
/*
Dependency graph has been constructed. However, there are still data
hazards need to be handled.
...
...
@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
if
(
strategy_
.
reduce_
!=
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
-
1
;
}
int
op_role
=
boost
::
get
<
int
>
(
op
.
GetAttr
(
framework
::
OpProtoAndCheckerMaker
::
OpRoleAttrName
()));
if
(
op_role
!=
static_cast
<
int
>
(
framework
::
OpRole
::
kOptimize
))
{
return
-
1
;
}
auto
param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
op
.
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
for
(
auto
&
varname
:
op
.
InputArgumentNames
())
{
int
dev_id
=
GetVarDeviceID
(
varname
);
if
(
dev_id
!=
-
1
)
{
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
]);
PADDLE_ENFORCE_NE
(
dev_id
,
-
1
,
"dev_id should not be -1.[%s, %s]"
,
op
.
Type
(),
param_grad
[
0
]);
return
dev_id
;
}
}
return
-
1
;
}
int
MultiDevSSAGraphBuilder
::
GetVarDeviceID
(
const
std
::
string
&
varname
)
const
{
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
2a672d24
...
...
@@ -45,6 +45,7 @@ class ParallelExecutorPrivate {
#endif
bool
own_local_scope_
;
bool
use_cuda_
;
bool
use_all_reduce_
;
};
std
::
vector
<
Scope
*>
&
ParallelExecutor
::
GetLocalScopes
()
{
...
...
@@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor(
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
use_cuda_
=
exec_strategy
.
use_cuda_
;
member_
->
use_all_reduce_
=
build_strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
;
if
(
!
member_
->
use_all_reduce_
)
{
PADDLE_ENFORCE
(
places
.
size
()
>
1
,
"If you set build_strategy.reduce with 'Reduce',"
"the number of places must be greater than 1."
);
}
// Step 1. Bcast the params to devs.
// Create local scopes
...
...
@@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
}
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToDevs
(
bcast_vars
);
BCastParamsToDev
ice
s
(
bcast_vars
);
}
// Startup Program has been run. All local scopes has correct parameters.
...
...
@@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA
builder_factory
.
SetNCCLContextMap
(
member_
->
nccl_ctxs_
.
get
());
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
PADDLE_THROW
(
"Not compiled with CUDA
.
"
);
#endif
}
...
...
@@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor(
member_
->
places_
,
std
::
move
(
member_
->
executor_
)));
}
void
ParallelExecutor
::
BCastParamsToDevs
(
void
ParallelExecutor
::
BCastParamsToDev
ice
s
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
{
// the
the
initializing bcast, all vars would be bcast from device(0),
// the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device.
bool
initializing
=
builder_
.
get
()
==
nullptr
?
true
:
false
;
...
...
@@ -209,9 +218,16 @@ void ParallelExecutor::BCastParamsToDevs(
auto
local_scope
=
member_
->
local_scopes_
[
i
];
auto
*
t
=
local_scope
->
Var
(
var
)
->
GetMutable
<
LoDTensor
>
();
// FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
if
(
member_
->
use_all_reduce_
||
member_
->
use_cuda_
||
var
==
"@LR_DECAY_COUNTER@"
)
{
t
->
Resize
(
dims
);
t
->
mutable_data
(
cpu
,
main_tensor
.
type
());
paddle
::
framework
::
TensorCopy
(
main_tensor
,
cpu
,
t
);
}
else
{
t
->
ShareDataWith
(
main_tensor
);
}
}
}
}
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
2a672d24
...
...
@@ -66,7 +66,7 @@ class ParallelExecutor {
void
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
);
void
BCastParamsToDevs
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
void
BCastParamsToDev
ice
s
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
private:
ParallelExecutorPrivate
*
member_
;
...
...
paddle/fluid/operators/checkpoint_notify_op.cc
浏览文件 @
2a672d24
...
...
@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
2a672d24
...
...
@@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req_count_
++
;
}
void
GRPCClient
::
Wait
()
{
bool
GRPCClient
::
Wait
()
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
sync_mutex_
);
sync_cond_
.
wait
(
lk
,
[
this
]
{
return
req_count_
==
0
;
});
sync_cond_
.
wait
(
lk
,
[
this
]
{
return
(
req_count_
==
0
||
ok_
==
false
);
});
return
ok_
;
}
void
GRPCClient
::
Proceed
()
{
...
...
@@ -297,6 +298,14 @@ void GRPCClient::Proceed() {
if
(
c
->
status_
.
ok
())
{
VLOG
(
3
)
<<
c
->
var_h_
.
String
()
<<
" process"
;
c
->
Process
();
}
else
if
(
c
->
status_
.
error_code
()
==
grpc
::
StatusCode
::
DEADLINE_EXCEEDED
)
{
LOG
(
ERROR
)
<<
c
->
var_h_
.
String
()
<<
" meets grpc error:"
<<
c
->
status_
.
error_message
();
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
sync_mutex_
);
ok_
=
false
;
}
sync_cond_
.
notify_all
();
}
else
{
LOG
(
FATAL
)
<<
c
->
var_h_
.
String
()
<<
" meets grpc error:"
<<
c
->
status_
.
error_message
();
...
...
paddle/fluid/operators/distributed/grpc_client.h
浏览文件 @
2a672d24
...
...
@@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class
GRPCClient
:
public
RPCClient
{
public:
GRPCClient
()
{}
GRPCClient
()
:
ok_
(
true
)
{}
virtual
~
GRPCClient
();
bool
AsyncSendVar
(
const
std
::
string
&
ep
,
const
platform
::
DeviceContext
&
ctx
,
...
...
@@ -221,7 +221,7 @@ class GRPCClient : public RPCClient {
void
AsyncSendEndPass
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
void
Wait
()
override
;
bool
Wait
()
override
;
void
SendBeginPass
()
override
;
...
...
@@ -247,6 +247,7 @@ class GRPCClient : public RPCClient {
std
::
mutex
sync_mutex_
;
std
::
condition_variable
sync_cond_
;
std
::
atomic
<
int64_t
>
req_count_
{
0
};
bool
ok_
;
// mutex for GetChannel thread safety
std
::
mutex
chan_mutex_
;
...
...
paddle/fluid/operators/distributed/rpc_client.h
浏览文件 @
2a672d24
...
...
@@ -72,7 +72,7 @@ class RPCClient {
virtual
void
SendBeginPass
()
=
0
;
virtual
void
SendEndPass
()
=
0
;
virtual
void
Wait
()
=
0
;
virtual
bool
Wait
()
=
0
;
template
<
typename
T
>
static
RPCClient
*
GetInstance
()
{
...
...
paddle/fluid/operators/fetch_barrier_op.cc
浏览文件 @
2a672d24
...
...
@@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase {
distributed
::
RPCClient
*
rpc_client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
();
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"fetch barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendFetchBarrier
(
ep
);
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/math/blas.h
浏览文件 @
2a672d24
...
...
@@ -21,6 +21,10 @@
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_WITH_LIBXSMM
#include <libxsmm.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
2a672d24
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -30,6 +31,12 @@ struct CBlas<float> {
platform
::
dynload
::
cblas_sgemm
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
libxsmm_sgemm
(
args
...);
}
#endif
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_saxpy
(
args
...);
...
...
@@ -63,6 +70,12 @@ struct CBlas<double> {
platform
::
dynload
::
cblas_dgemm
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
libxsmm_dgemm
(
args
...);
}
#endif
template
<
typename
...
ARGS
>
static
void
AXPY
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_daxpy
(
args
...);
...
...
@@ -140,6 +153,9 @@ struct CBlas<double> {
template
<
>
struct
CBlas
<
platform
::
float16
>
{
static
void
GEMM
(...)
{
PADDLE_THROW
(
"float16 GEMM not supported on CPU"
);
}
static
void
SMM_GEMM
(...)
{
PADDLE_THROW
(
"float16 SMM_GEMM not supported on CPU"
);
}
#ifdef PADDLE_WITH_MKLML
static
void
GEMM_BATCH
(...)
{
PADDLE_THROW
(
"float16 GEMM_BATCH not supported on CPU"
);
...
...
@@ -147,6 +163,33 @@ struct CBlas<platform::float16> {
#endif
};
template
<
typename
T
>
inline
bool
UseXSMM
(
const
int
&
m
,
const
int
&
n
,
const
int
&
k
,
bool
transa
,
bool
transb
,
const
T
&
alpha
,
const
T
&
beta
)
{
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom
constexpr
int
LIBXSMM_THRESHOLD
=
20
*
20
*
20
;
if
(
m
*
n
*
k
>
LIBXSMM_THRESHOLD
||
transa
||
transb
||
std
::
abs
<
T
>
(
alpha
-
static_cast
<
T
>
(
1
)
>
std
::
numeric_limits
<
T
>::
epsilon
())
||
std
::
abs
<
T
>
(
beta
)
>
std
::
numeric_limits
<
T
>::
epsilon
())
{
return
false
;
}
else
{
return
true
;
}
#endif
return
false
;
}
template
<
>
inline
bool
UseXSMM
<
platform
::
float16
>
(
const
int
&
m
,
const
int
&
n
,
const
int
&
k
,
bool
transa
,
bool
transb
,
const
platform
::
float16
&
alpha
,
const
platform
::
float16
&
beta
)
{
return
false
;
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
...
...
@@ -156,8 +199,21 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
CBlas
<
T
>::
GEMM
(
CblasRowMajor
,
transA
,
transB
,
M
,
N
,
K
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
#ifdef PADDLE_WITH_LIBXSMM
if
(
UseXSMM
(
M
,
N
,
K
,
transA
!=
CblasNoTrans
,
transB
!=
CblasNoTrans
,
alpha
,
beta
))
{
// Note: SMM use ColMajor
const
char
transa
=
'N'
;
const
char
transb
=
'N'
;
CBlas
<
T
>::
SMM_GEMM
(
&
transa
,
&
transb
,
&
N
,
&
M
,
&
K
,
&
alpha
,
B
,
&
ldb
,
A
,
&
lda
,
&
beta
,
C
,
&
ldc
);
}
else
{
#endif
CBlas
<
T
>::
GEMM
(
CblasRowMajor
,
transA
,
transB
,
M
,
N
,
K
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
#ifdef PADDLE_WITH_LIBXSMM
}
#endif
}
template
<
>
...
...
paddle/fluid/operators/math/math_function_test.cc
浏览文件 @
2a672d24
...
...
@@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) {
EXPECT_EQ
(
input3_ptr
[
6
],
86
);
EXPECT_EQ
(
input3_ptr
[
7
],
99
);
}
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
T
>
void
MklSmmCompare
(
int
m
,
int
n
,
int
k
)
{
paddle
::
framework
::
Tensor
mat_a
;
paddle
::
framework
::
Tensor
mat_b
;
paddle
::
framework
::
Tensor
mat_c_smm
;
paddle
::
framework
::
Tensor
mat_c_mkl
;
auto
*
cpu_place
=
new
paddle
::
platform
::
CPUPlace
();
T
*
A
=
mat_a
.
mutable_data
<
T
>
({
m
,
k
},
*
cpu_place
);
T
*
B
=
mat_b
.
mutable_data
<
T
>
({
k
,
n
},
*
cpu_place
);
T
*
CSMM
=
mat_c_smm
.
mutable_data
<
T
>
({
m
,
n
},
*
cpu_place
);
T
*
CMKL
=
mat_c_mkl
.
mutable_data
<
T
>
({
m
,
n
},
*
cpu_place
);
T
alpha
=
static_cast
<
T
>
(
1
);
T
beta
=
static_cast
<
T
>
(
0
);
for
(
int
i
=
0
;
i
<
mat_a
.
numel
();
++
i
)
{
A
[
i
]
=
static_cast
<
T
>
(
i
);
}
for
(
int
i
=
0
;
i
<
mat_b
.
numel
();
++
i
)
{
B
[
i
]
=
static_cast
<
T
>
(
i
);
}
// lda,ldb,ldc follow RowMajor
int
lda
=
k
;
int
ldb
=
n
;
int
ldc
=
n
;
auto
smm
=
[
&
,
m
,
n
,
k
,
lda
,
ldb
,
ldc
,
alpha
,
beta
]()
{
const
char
transa
=
'N'
;
const
char
transb
=
'N'
;
paddle
::
operators
::
math
::
CBlas
<
T
>::
SMM_GEMM
(
&
transa
,
&
transb
,
&
n
,
&
m
,
&
k
,
&
alpha
,
B
,
&
ldb
,
A
,
&
lda
,
&
beta
,
CSMM
,
&
ldc
);
};
auto
mkl
=
[
&
,
m
,
n
,
k
,
lda
,
ldb
,
ldc
,
alpha
,
beta
]()
{
paddle
::
operators
::
math
::
CBlas
<
T
>::
GEMM
(
CblasRowMajor
,
CblasNoTrans
,
CblasNoTrans
,
m
,
n
,
k
,
alpha
,
A
,
lda
,
B
,
ldb
,
beta
,
CMKL
,
ldc
);
};
smm
();
mkl
();
ASSERT_EQ
(
mat_c_mkl
.
numel
(),
mat_c_smm
.
numel
());
for
(
int
i
=
0
;
i
<
mat_c_mkl
.
numel
();
++
i
)
{
EXPECT_FLOAT_EQ
(
CSMM
[
i
],
CMKL
[
i
]);
}
}
TEST
(
math_function
,
gemm_mkl_vs_smm
)
{
MklSmmCompare
<
float
>
(
1
,
2
,
3
);
MklSmmCompare
<
double
>
(
1
,
2
,
3
);
MklSmmCompare
<
float
>
(
3
,
2
,
1
);
MklSmmCompare
<
double
>
(
3
,
2
,
1
);
MklSmmCompare
<
float
>
(
3
,
8
,
5
);
MklSmmCompare
<
double
>
(
3
,
8
,
5
);
}
#endif
TEST
(
math_function
,
gemm_trans_c
lb
as
)
{
TEST
(
math_function
,
gemm_trans_c
bl
as
)
{
paddle
::
framework
::
Tensor
input1
;
paddle
::
framework
::
Tensor
input2
;
paddle
::
framework
::
Tensor
input3
;
...
...
paddle/fluid/operators/prefetch_op.cc
浏览文件 @
2a672d24
...
...
@@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
};
...
...
paddle/fluid/operators/recv_op.cc
浏览文件 @
2a672d24
...
...
@@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase {
rpc_client
->
AsyncGetVar
(
epmap
[
i
],
ctx
,
scope
,
outs
[
i
]);
}
if
(
sync_mode
)
{
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/operators/send_barrier_op.cc
浏览文件 @
2a672d24
...
...
@@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase {
VLOG
(
3
)
<<
"SendBarrierOp sync_mode:"
<<
sync_mode
;
// need to wait before sending send_barrier message
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
if
(
sync_mode
)
{
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"send barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendBatchBarrier
(
ep
);
}
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/operators/send_op.cc
浏览文件 @
2a672d24
...
...
@@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase {
}
}
if
(
sync_send
)
{
rpc_client
->
Wait
(
);
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
}
};
...
...
paddle/fluid/operators/top_k_op.h
浏览文件 @
2a672d24
...
...
@@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel<T> {
#endif
for
(
size_t
i
=
0
;
i
<
row
;
i
++
)
{
std
::
vector
<
std
::
pair
<
T
,
size_t
>>
vec
;
vec
.
reserve
(
col
);
for
(
size_t
j
=
0
;
j
<
col
;
j
++
)
{
vec
.
push_back
(
std
::
pair
<
T
,
size_t
>
(
eg_input
(
i
,
j
),
j
));
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
2a672d24
...
...
@@ -68,7 +68,7 @@ bool IsCompiledWithCUDA() {
}
bool
IsCompiledWithDIST
()
{
#ifdef PADDLE_WITH_DIST
#ifdef PADDLE_WITH_DIST
RIBUTE
return
true
;
#else
return
false
;
...
...
@@ -669,7 +669,7 @@ All parameter, weight, gradient are variables in Paddle.
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&
,
size_t
,
size_t
>
())
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToDevs
)
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToDev
ice
s
)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
...
paddle/legacy/utils/PythonUtil.cpp
浏览文件 @
2a672d24
...
...
@@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName,
const
std
::
string
&
funcName
,
const
std
::
vector
<
std
::
string
>&
args
)
{
PyObjectPtr
obj
=
callPythonFuncRetPyObj
(
moduleName
,
funcName
,
args
);
#if PY_MAJOR_VERSION >= 3
Py_ssize_t
str_size
=
0u
;
const
char
*
str
=
PyUnicode_AsUTF8AndSize
(
obj
.
get
(),
&
str_size
);
return
std
::
string
(
str
,
(
size_t
)
str_size
);
#else
return
std
::
string
(
PyString_AsString
(
obj
.
get
()),
PyString_Size
(
obj
.
get
()));
#endif // PY_MAJOR_VERSION >= 3
}
PyObjectPtr
createPythonClass
(
...
...
paddle/legacy/utils/PythonUtil.h
浏览文件 @
2a672d24
...
...
@@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
namespace
py
{
PyObjectPtr
import
(
const
std
::
string
&
moduleName
);
#if PY_MAJOR_VERSION >= 3
/**
* Cast a PyLong to int type T.
* @tparam T return type.
* @param [in] obj PyLong object.
* @param [out] ok status for casting. False if error occured. nullptr if user
* don't care is ok or not.
* @return The value of python object, or 0 if not ok.
*/
template
<
typename
T
>
T
castInt
(
PyObject
*
obj
,
bool
*
ok
=
nullptr
)
{
// Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object
// were unified to long since python3
if
(
PyLong_Check
(
obj
))
{
if
(
ok
)
*
ok
=
true
;
return
(
T
)
PyLong_AsUnsignedLong
(
obj
);
}
else
{
if
(
ok
)
*
ok
=
false
;
return
(
T
)
0
;
}
}
// Convert PyAPI from 2.x to 3.x
#define PyString_FromString PyUnicode_FromString
#define PyString_AsString PyUnicode_AsUTF8
#else
/**
* Cast a PyLong or PyInt to int type T.
* @tparam T return type.
...
...
@@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) {
return
(
T
)
0
;
}
}
#endif // PY_MAJOR_VERSION >= 3
/**
* Invoke repr of python object.
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
2a672d24
...
...
@@ -78,6 +78,12 @@ function cmake_gen() {
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
elif
[
"
$1
"
==
"cp35-cp35m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.5.1/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.5.1/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
fi
fi
...
...
@@ -108,6 +114,7 @@ function cmake_gen() {
-DWITH_CONTRIB=
${
WITH_CONTRIB
:-
ON
}
-DWITH_ANAKIN=
${
WITH_ANAKIN
:-
OFF
}
-DWITH_INFERENCE_DEMO=
${
WITH_INFERENCE_DEMO
:-
ON
}
-DPY_VERSION=
${
PY_VERSION
:-
2
.7
}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
...
...
@@ -136,7 +143,8 @@ EOF
-DCMAKE_EXPORT_COMPILE_COMMANDS
=
ON
\
-DWITH_CONTRIB
=
${
WITH_CONTRIB
:-
ON
}
\
-DWITH_ANAKIN
=
${
WITH_ANAKIN
:-
OFF
}
\
-DWITH_INFERENCE_DEMO
=
${
WITH_INFERENCE_DEMO
:-
ON
}
-DWITH_INFERENCE_DEMO
=
${
WITH_INFERENCE_DEMO
:-
ON
}
\
-DPY_VERSION
=
${
PY_VERSION
:-
2
.7
}
}
function
abort
(){
...
...
python/paddle/__init__.py
浏览文件 @
2a672d24
...
...
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
try
:
from
version
import
full_version
as
__version__
from
version
import
commit
as
__git_commit__
from
paddle.
version
import
full_version
as
__version__
from
paddle.
version
import
commit
as
__git_commit__
except
ImportError
:
import
sys
...
...
@@ -21,7 +21,7 @@ except ImportError:
import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
)
import
reader
import
dataset
import
batch
import
paddle.
reader
import
paddle.
dataset
import
paddle.
batch
batch
=
batch
.
batch
python/paddle/dataset/__init__.py
浏览文件 @
2a672d24
...
...
@@ -15,20 +15,20 @@
Dataset package.
"""
import
mnist
import
imikolov
import
imdb
import
cifar
import
movielens
import
conll05
import
uci_housing
import
sentiment
import
wmt14
import
wmt16
import
mq2007
import
flowers
import
voc2012
import
image
import
paddle.dataset.
mnist
import
paddle.dataset.
imikolov
import
paddle.dataset.
imdb
import
paddle.dataset.
cifar
import
paddle.dataset.
movielens
import
paddle.dataset.
conll05
import
paddle.dataset.
uci_housing
import
paddle.dataset.
sentiment
import
paddle.dataset.
wmt14
import
paddle.dataset.
wmt16
import
paddle.dataset.
mq2007
import
paddle.dataset.
flowers
import
paddle.dataset.
voc2012
import
paddle.dataset.
image
__all__
=
[
'mnist'
,
...
...
python/paddle/fluid/clip.py
浏览文件 @
2a672d24
...
...
@@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None):
param
.
gradient_clip_attr
=
copy
.
deepcopy
(
clip
)
def
append_gradient_clip_ops
(
param_grad
):
def
append_gradient_clip_ops
(
param_grad
s
):
context
=
dict
()
for
p
,
g
in
param_grad
:
with
p
.
block
.
program
.
optimized_guard
(
p
):
for
p
,
g
in
param_grads
:
if
g
is
None
:
continue
with
p
.
block
.
program
.
optimized_guard
([
p
,
g
]):
clip_attr
=
getattr
(
p
,
'gradient_clip_attr'
,
NullGradientClipAttr
())
if
clip_attr
is
None
:
clip_attr
=
NullGradientClipAttr
()
...
...
@@ -339,8 +341,10 @@ def append_gradient_clip_ops(param_grad):
clip_attr
.
_process_context
(
context
=
context
,
param
=
p
,
grad
=
g
)
res
=
[]
for
p
,
g
in
param_grad
:
with
p
.
block
.
program
.
optimized_guard
(
p
):
for
p
,
g
in
param_grads
:
if
g
is
None
:
continue
with
p
.
block
.
program
.
optimized_guard
([
p
,
g
]):
res
.
append
(
clip_attr
.
_create_operators
(
param
=
p
,
grad
=
g
))
return
res
...
...
python/paddle/fluid/framework.py
浏览文件 @
2a672d24
...
...
@@ -1319,7 +1319,7 @@ class Program(object):
self
.
_op_role_var
=
[
var_name
]
@
contextlib
.
contextmanager
def
optimized_guard
(
self
,
var
):
def
optimized_guard
(
self
,
param_and_grads
):
"""
A with guard to set :code:`Optimization` :code:`OpRole` and
:code:`OpRoleVar` automatically.
...
...
@@ -1327,17 +1327,20 @@ class Program(object):
Notes: This is a very low level API. Users should not use it directly.
Args:
var(Variable|str): The variable (name
) to be optimized.
param_and_grads(list): The variables (names
) to be optimized.
Examples:
>>> p, g = backward(...)
>>> with program.optimized_guard(
p
):
>>> with program.optimized_guard(
[p,g]
):
>>> p = p - 0.001 * g
"""
OpRole
=
core
.
op_proto_and_checker_maker
.
OpRole
self
.
_current_role
=
OpRole
.
Optimize
self
.
_op_role_var
=
[
var
.
name
if
isinstance
(
var
,
Variable
)
else
var
]
self
.
_op_role_var
=
[
var
.
name
if
isinstance
(
var
,
Variable
)
else
var
for
var
in
param_and_grads
]
yield
self
.
_op_role_var
=
[]
self
.
_current_role
=
OpRole
.
Forward
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
2a672d24
...
...
@@ -123,7 +123,7 @@ class Optimizer(object):
"""
pass
def
_finish_update
(
self
,
block
,
parameters
):
def
_finish_update
(
self
,
block
,
parameters
_and_grads
):
"""Finish any custom updates needed
before completing an optimization step
...
...
@@ -226,18 +226,18 @@ class Optimizer(object):
optimize_ops
=
[]
for
param_and_grad
in
parameters_and_grads
:
if
param_and_grad
[
1
]
is
None
:
continue
with
param_and_grad
[
0
].
block
.
program
.
optimized_guard
(
param_and_grad
[
0
]):
if
param_and_grad
[
0
].
trainable
is
True
and
param_and_grad
[
1
]
is
not
None
:
param_and_grad
):
if
param_and_grad
[
0
].
trainable
is
True
:
optimize_op
=
self
.
_append_optimize_op
(
loss
.
block
,
param_and_grad
)
optimize_ops
.
append
(
optimize_op
)
# Get custom finish ops for subclasses
# FIXME: Need to fix this once we figure out how to handle dependencies
self
.
_finish_update
(
loss
.
block
,
[
p
[
0
]
for
p
in
parameters_and_grads
])
self
.
_finish_update
(
loss
.
block
,
parameters_and_grads
)
end
=
len
(
global_block
.
ops
)
return
global_block
.
slice_ops
(
start
,
end
)
...
...
@@ -564,13 +564,15 @@ class AdamOptimizer(Optimizer):
return
adam_op
def
_finish_update
(
self
,
block
,
param
eter
s
):
def
_finish_update
(
self
,
block
,
param
_and_grad
s
):
"""Update Beta1 and Beta2 Power accumulators
"""
assert
isinstance
(
block
,
framework
.
Block
)
main_block
=
block
.
program
.
global_block
()
for
param
in
parameters
:
with
param
.
block
.
program
.
optimized_guard
(
param
):
for
param
,
grad
in
param_and_grads
:
if
grad
is
None
:
continue
with
param
.
block
.
program
.
optimized_guard
([
param
,
grad
]):
beta1_pow_acc
=
self
.
_get_accumulator
(
self
.
_beta1_pow_acc_str
,
param
)
beta2_pow_acc
=
self
.
_get_accumulator
(
self
.
_beta2_pow_acc_str
,
...
...
@@ -691,13 +693,15 @@ class AdamaxOptimizer(Optimizer):
return
adamax_op
def
_finish_update
(
self
,
block
,
parameters
):
def
_finish_update
(
self
,
block
,
parameters
_and_grads
):
"""Update Beta1 Power accumulator
"""
assert
isinstance
(
block
,
framework
.
Block
)
main_block
=
block
.
program
.
global_block
()
for
param
in
parameters
:
with
param
.
block
.
program
.
optimized_guard
(
param
):
for
param
,
grad
in
parameters_and_grads
:
if
grad
is
None
:
continue
with
param
.
block
.
program
.
optimized_guard
([
param
,
grad
]):
beta1_pow_acc
=
self
.
_get_accumulator
(
self
.
_beta1_pow_acc_str
,
param
)
main_block
.
append_op
(
...
...
@@ -1158,7 +1162,9 @@ class ModelAverage(Optimizer):
self
.
params_grads
.
append
((
param
,
grad
))
for
param
,
grad
in
self
.
params_grads
:
with
param
.
block
.
program
.
optimized_guard
(
param
):
if
grad
is
None
:
continue
with
param
.
block
.
program
.
optimized_guard
([
param
,
grad
]):
self
.
_append_average_accumulate_op
(
param
)
self
.
apply_program
=
Program
()
...
...
python/paddle/fluid/regularizer.py
浏览文件 @
2a672d24
...
...
@@ -41,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
"""
params_and_grads
=
[]
for
param
,
grad
in
parameters_and_grads
:
with
param
.
block
.
program
.
optimized_guard
(
param
):
# If no gradient then we don't need to do anything
if
grad
is
None
:
params_and_grads
.
append
((
param
,
grad
))
continue
with
param
.
block
.
program
.
optimized_guard
([
param
,
grad
]):
regularization_term
=
None
if
param
.
regularizer
is
not
None
:
# Add variable for regularization term in grad block
...
...
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
2a672d24
...
...
@@ -35,7 +35,8 @@ class TestParallelExecutorBase(unittest.TestCase):
feed_dict
=
None
,
seed
=
None
,
use_parallel_executor
=
True
,
balance_parameter_opt_between_cards
=
False
):
use_reduce
=
False
,
optimizer
=
fluid
.
optimizer
.
Adam
):
def
run_executor
(
exe
,
feed
,
fetch_list
,
program
=
None
):
if
isinstance
(
exe
,
fluid
.
ParallelExecutor
):
res
=
exe
.
run
(
fetch_list
=
fetch_list
,
feed
=
feed
)
...
...
@@ -50,14 +51,19 @@ class TestParallelExecutorBase(unittest.TestCase):
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
.
random_seed
=
1
# Fix random seed
main
.
random_seed
=
1
with
fluid
.
program_guard
(
main
,
startup
):
if
seed
is
not
None
:
startup
.
random_seed
=
seed
main
.
random_seed
=
seed
loss
=
method
(
use_feed
=
feed_dict
is
not
None
)
adam
=
fluid
.
optimizer
.
Adam
()
adam
.
minimize
(
loss
)
optimizer
().
minimize
(
loss
)
if
memory_opt
:
fluid
.
memory_optimize
(
main
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
startup
)
...
...
@@ -65,7 +71,8 @@ class TestParallelExecutorBase(unittest.TestCase):
exec_strategy
.
allow_op_delay
=
allow_op_delay
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
if
balance_parameter_opt_between_cards
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
\
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
if
use_parallel_executor
:
exe
=
fluid
.
ParallelExecutor
(
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
2a672d24
...
...
@@ -101,9 +101,7 @@ class TestMNIST(TestParallelExecutorBase):
fluid
.
recordio_writer
.
convert_reader_to_recordio_file
(
MNIST_RECORDIO_FILE
,
reader
,
feeder
)
def
check_simple_fc_convergence
(
self
,
balance_parameter_opt_between_cards
,
use_cuda
=
True
):
def
check_simple_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
)
...
...
@@ -115,20 +113,19 @@ class TestMNIST(TestParallelExecutorBase):
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
use_reduce
=
use_reduce
)
def
test_simple_fc
(
self
):
self
.
check_simple_fc_convergence
(
False
,
use_cuda
=
True
)
self
.
check_simple_fc_convergence
(
False
,
use_cuda
=
False
)
# use_cuda
self
.
check_simple_fc_convergence
(
True
)
self
.
check_simple_fc_convergence
(
False
)
def
test_simple_fc_with_new_strategy
(
self
):
self
.
check_simple_fc_convergence
(
True
,
use_cuda
=
True
)
self
.
check_simple_fc_convergence
(
True
,
use_cuda
=
False
)
# use_cuda, use_reduce
self
.
check_simple_fc_convergence
(
True
,
True
)
self
.
check_simple_fc_convergence
(
False
,
True
)
def
check_simple_fc_parallel_accuracy
(
self
,
balance_parameter_opt_between_cards
,
use_cuda
=
True
):
def
check_simple_fc_parallel_accuracy
(
self
,
use_cuda
,
use_reduce
=
False
):
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
single_first_loss
,
single_last_loss
=
self
.
check_network_convergence
(
...
...
@@ -145,8 +142,7 @@ class TestMNIST(TestParallelExecutorBase):
"label"
:
label
},
use_cuda
=
use_cuda
,
use_parallel_executor
=
True
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
use_reduce
=
use_reduce
)
for
p_f
in
parallel_first_loss
:
self
.
assertAlmostEquals
(
p_f
,
single_first_loss
[
0
],
delta
=
1e-6
)
...
...
@@ -154,15 +150,15 @@ class TestMNIST(TestParallelExecutorBase):
self
.
assertAlmostEquals
(
p_l
,
single_last_loss
[
0
],
delta
=
1e-6
)
def
test_simple_fc_parallel_accuracy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
False
,
use_cuda
=
True
)
self
.
check_simple_fc_parallel_accuracy
(
False
,
use_cuda
=
False
)
self
.
check_simple_fc_parallel_accuracy
(
True
)
self
.
check_simple_fc_parallel_accuracy
(
False
)
def
test_simple_fc_parallel_accuracy_with_new_strategy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
True
,
use_cuda
=
True
)
self
.
check_simple_fc_parallel_accuracy
(
True
,
use_cuda
=
False
)
# use_cuda, use_reduce
self
.
check_simple_fc_parallel_accuracy
(
True
,
True
)
self
.
check_simple_fc_parallel_accuracy
(
False
,
True
)
def
check_batchnorm_fc_convergence
(
self
,
balance_parameter_opt_between_cards
,
use_cuda
):
def
check_batchnorm_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
self
.
check_network_convergence
(
fc_with_batchnorm
,
use_cuda
=
use_cuda
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
...
...
@@ -171,16 +167,16 @@ class TestMNIST(TestParallelExecutorBase):
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
use_reduce
=
use_reduce
)
def
test_batchnorm_fc
(
self
):
self
.
check_batchnorm_fc_convergence
(
False
,
use_cuda
=
True
)
self
.
check_batchnorm_fc_convergence
(
False
,
use_cuda
=
False
)
self
.
check_batchnorm_fc_convergence
(
True
)
self
.
check_batchnorm_fc_convergence
(
False
)
def
test_batchnorm_fc_with_new_strategy
(
self
):
self
.
check_batchnorm_fc_convergence
(
True
,
use_cuda
=
True
)
self
.
check_batchnorm_fc_convergence
(
True
,
use_cuda
=
False
)
# use_cuda, use_reduce
self
.
check_batchnorm_fc_convergence
(
True
,
True
)
self
.
check_batchnorm_fc_convergence
(
False
,
True
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
浏览文件 @
2a672d24
...
...
@@ -13,8 +13,12 @@
# limitations under the License.
import
paddle.fluid
as
fluid
import
paddle.fluid.layers.ops
as
ops
from
paddle.fluid.initializer
import
init_on_cpu
from
paddle.fluid.layers.learning_rate_scheduler
import
_decay_step_counter
from
parallel_executor_test_base
import
TestParallelExecutorBase
import
unittest
import
math
import
os
...
...
@@ -131,30 +135,71 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
class
TestResnet
(
TestParallelExecutorBase
):
def
check_resnet_convergence
(
self
,
balance_parameter_opt_between_cards
,
def
check_resnet_convergence_with_learning_rate_decay
(
self
,
use_cuda
=
True
,
use_reduce
=
False
,
iter
=
20
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
def
_cosine_decay
(
learning_rate
,
step_each_epoch
,
epochs
=
120
):
"""
Applies cosine decay to the learning rate.
lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
"""
global_step
=
_decay_step_counter
()
with
init_on_cpu
():
epoch
=
ops
.
floor
(
global_step
/
step_each_epoch
)
decayed_lr
=
learning_rate
*
\
(
ops
.
cos
(
epoch
*
(
math
.
pi
/
epochs
))
+
1
)
/
2
return
decayed_lr
def
_optimizer
(
learning_rate
=
0.01
):
optimizer
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
_cosine_decay
(
learning_rate
=
learning_rate
,
step_each_epoch
=
2
,
epochs
=
1
),
momentum
=
0.9
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-4
))
return
optimizer
import
functools
batch_size
=
2
self
.
check_network_convergence
(
single_first_loss
,
single_last_loss
=
self
.
check_network_convergence
(
functools
.
partial
(
SE_ResNeXt50Small
,
batch_size
=
batch_size
),
iter
=
iter
,
batch_size
=
batch_size
,
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
def
test_resnet
(
self
):
self
.
check_resnet_convergence
(
False
,
use_cuda
=
True
)
self
.
check_resnet_convergence
(
False
,
use_cuda
=
False
,
iter
=
5
)
use_reduce
=
use_reduce
,
optimizer
=
_optimizer
,
use_parallel_executor
=
False
)
def
test_resnet_with_new_strategy
(
self
):
self
.
check_resnet_convergence
(
True
,
use_cuda
=
True
)
self
.
check_resnet_convergence
(
True
,
use_cuda
=
False
,
iter
=
5
)
parallel_first_loss
,
parallel_last_loss
=
self
.
check_network_convergence
(
functools
.
partial
(
SE_ResNeXt50Small
,
batch_size
=
batch_size
),
iter
=
iter
,
batch_size
=
batch_size
,
use_cuda
=
use_cuda
,
use_reduce
=
use_reduce
,
optimizer
=
_optimizer
)
for
p_f
in
parallel_first_loss
:
self
.
assertAlmostEquals
(
p_f
,
single_first_loss
[
0
],
delta
=
1e-6
)
for
p_l
in
parallel_last_loss
:
self
.
assertAlmostEquals
(
p_l
,
single_last_loss
[
0
],
delta
=
1e-6
)
def
test_seresnext_with_learning_rate_decay
(
self
):
self
.
check_resnet_convergence_with_learning_rate_decay
(
True
,
False
)
self
.
check_resnet_convergence_with_learning_rate_decay
(
False
,
False
,
iter
=
5
)
def
test_seresnext_with_new_strategy_with_learning_rate_decay
(
self
):
self
.
check_resnet_convergence_with_learning_rate_decay
(
True
,
True
)
self
.
check_resnet_convergence_with_learning_rate_decay
(
False
,
True
,
iter
=
5
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
2a672d24
...
...
@@ -31,6 +31,7 @@ Steps to transpile pserver:
from
__future__
import
print_function
import
math
import
random
import
numpy
as
np
from
ps_dispatcher
import
RoundRobin
,
HashName
,
PSDispatcher
...
...
@@ -197,7 +198,8 @@ class DistributeTranspiler(object):
# shuffle the map will avoid the uneven distribution above
grad_var_mapping_items
=
self
.
grad_var_mapping
.
items
()
if
not
slice_var_up
:
np
.
random
.
shuffle
(
grad_var_mapping_items
)
random
.
seed
(
self
.
trainer_num
)
random
.
shuffle
(
grad_var_mapping_items
)
for
orig_varname
,
splited_vars
in
grad_var_mapping_items
:
eplist
=
ps_dispatcher
.
dispatch
(
splited_vars
)
...
...
python/paddle/reader/__init__.py
浏览文件 @
2a672d24
...
...
@@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator:
TODO(yuyang18): Should we add whole design doc here?
"""
import
decorator
from
decorator
import
*
import
paddle.reader.
decorator
from
paddle.reader.
decorator
import
*
import
creator
import
paddle.reader.
creator
__all__
=
decorator
.
__all__
+
[
'creator'
]
python/paddle/reader/decorator.py
浏览文件 @
2a672d24
...
...
@@ -20,7 +20,7 @@ __all__ = [
from
threading
import
Thread
import
subprocess
from
Q
ueue
import
Queue
from
six.moves.q
ueue
import
Queue
import
itertools
import
random
import
zlib
...
...
python/requirements.txt
浏览文件 @
2a672d24
...
...
@@ -8,4 +8,4 @@ scipy>=0.19.0
Pillow
nltk>=3.2.2
graphviz
LinkChecker
six
python/setup.py.in
浏览文件 @
2a672d24
...
...
@@ -17,7 +17,8 @@ def git_commit():
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
except:
git_commit = 'Unknown'
return git_commit
git_commit = git_commit.decode()
return str(git_commit)
def _get_version_detail(idx):
assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
...
...
@@ -44,6 +45,7 @@ def is_taged():
try:
cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_tag = git_tag.decode()
except:
return False
...
...
@@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s'
def show():
if istaged:
print
'full_version:', full_version
print
'major:', major
print
'minor:', minor
print
'patch:', patch
print
'rc:', rc
print
('full_version:', full_version)
print
('major:', major)
print
('minor:', minor)
print
('patch:', patch)
print
('rc:', rc)
else:
print
'commit:', commit
print
('commit:', commit)
def mkl():
return with_mkl
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录