Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
6250be4b
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6250be4b
编写于
11月 26, 2018
作者:
P
peizhilin
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'windows/build' into windows/online
test=develop
上级
e0d47cc9
30849d1f
变更
19
显示空白变更内容
内联
并排
Showing
19 changed file
with
773 addition
and
65 deletion
+773
-65
cmake/operators.cmake
cmake/operators.cmake
+2
-1
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+1
-1
paddle/fluid/framework/transfer_scope_cache.cc
paddle/fluid/framework/transfer_scope_cache.cc
+14
-24
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+1
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+5
-1
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
...uid/operators/fused/fusion_transpose_flatten_concat_op.cc
+114
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
.../operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+115
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
...luid/operators/fused/fusion_transpose_flatten_concat_op.h
+50
-0
paddle/fluid/operators/lookup_sparse_table_op.cc
paddle/fluid/operators/lookup_sparse_table_op.cc
+1
-0
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+3
-0
paddle/testing/paddle_gtest_main.cc
paddle/testing/paddle_gtest_main.cc
+5
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+9
-5
python/paddle/fluid/contrib/utils/__init__.py
python/paddle/fluid/contrib/utils/__init__.py
+3
-1
python/paddle/fluid/contrib/utils/lookup_table_utils.py
python/paddle/fluid/contrib/utils/lookup_table_utils.py
+256
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+20
-0
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+40
-11
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+10
-10
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
...ests/unittests/test_fusion_transpose_flatten_concat_op.py
+105
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+19
-10
未找到文件。
cmake/operators.cmake
浏览文件 @
6250be4b
...
...
@@ -109,7 +109,8 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach
(
manual_pybind_op
"compare_op"
"logical_op"
"nccl_op"
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
)
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
"fusion_transpose_flatten_concat_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
set
(
pybind_flag 1
)
endif
()
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
6250be4b
...
...
@@ -116,7 +116,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library
(
op_info SRCS op_info.cc DEPS attribute framework_proto
)
cc_library
(
shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context
)
cc_library
(
transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto
)
cc_library
(
transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto
device_context
)
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache
)
...
...
paddle/fluid/framework/transfer_scope_cache.cc
浏览文件 @
6250be4b
...
...
@@ -17,16 +17,28 @@
namespace
paddle
{
namespace
framework
{
// Holds all the transfer scope across the process.
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
()
{
thread_local
auto
*
x
=
new
std
::
unordered_map
<
size_t
,
Scope
*>
;
typedef
std
::
unordered_map
<
size_t
,
Scope
*>
map_t
;
thread_local
std
::
unique_ptr
<
map_t
>
x
(
new
map_t
);
return
*
x
;
}
// Holds all the transfer scope for this thread.
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
()
{
thread_local
auto
*
x
=
new
std
::
unordered_set
<
Scope
*>
;
typedef
std
::
unordered_set
<
Scope
*>
set_t
;
thread_local
std
::
unique_ptr
<
set_t
>
x
(
new
set_t
);
return
*
x
;
}
// Try to create a transfer scope. If one cached scope has match the
// requirement, just return that one.
// Inputs:
// @type0: the source kernel type.
// @type1: the target kernel type.
// @scope: the execution scope of this op.
// Returns: A scope used to hold the transfer data across the different kernel
// type.
Scope
*
TryCreateTransferScope
(
OpKernelType
type0
,
OpKernelType
type1
,
const
Scope
*
scope
)
{
Scope
*
new_scope
{
nullptr
};
...
...
@@ -46,27 +58,5 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
return
new_scope
;
}
void
RemoveKidsFromTransferScopeCache
(
Scope
*
scope
)
{
auto
it
=
global_transfer_scope_cache
().
find
(
scope
);
if
(
it
!=
global_transfer_scope_cache
().
end
())
{
global_transfer_scope_cache
().
erase
(
it
);
}
for
(
auto
*
s
:
scope
->
kids
())
{
auto
it
=
global_transfer_scope_cache
().
find
(
s
);
if
(
it
!=
global_transfer_scope_cache
().
end
())
{
global_transfer_scope_cache
().
erase
(
it
);
}
}
// remove global transfer data cache
auto
&
cache
=
global_transfer_data_cache
();
for
(
auto
it
=
cache
.
begin
();
it
!=
cache
.
end
();)
{
if
(
it
->
second
==
scope
)
it
=
cache
.
erase
(
it
);
else
it
++
;
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
6250be4b
...
...
@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t
thread_num
=
32
;
size_t
sleep_time
=
40
;
size_t
extra_time
=
2
;
size_t
extra_time
=
10
;
// Reserve to perform more tests in the future
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
6250be4b
include
(
operators
)
register_operators
()
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op
)
if
(
WITH_GPU
)
op_library
(
fusion_transpose_flatten_concat_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);
\n
"
)
endif
()
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
0 → 100644
浏览文件 @
6250be4b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
// Operator class for the fused transpose -> flatten -> concat op.
// This class only performs shape inference.
class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Computes the shape of "Out".
  //
  // Every input in "X" is permuted with `trans_axis`, flattened into a 2-D
  // shape at `flatten_axis`, and the 2-D shapes are then concatenated along
  // `concat_axis`. All inputs must agree on the dimension that is not being
  // concatenated.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                      "Inputs(X) of ConcatOp should not be empty.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ConcatOp should not be null.");

    auto ins = ctx->GetInputsDim("X");
    const size_t n = ins.size();
    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");

    std::vector<int> trans_axis =
        ctx->Attrs().Get<std::vector<int>>("trans_axis");
    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
    int concat_axis = ctx->Attrs().Get<int>("concat_axis");

    // The flattened shapes always have exactly two entries, and
    // `concat_axis` is used below as an index into them, so anything other
    // than 0 or 1 would read and write out of bounds.
    PADDLE_ENFORCE(concat_axis == 0 || concat_axis == 1,
                   "The concat_axis should be 0 or 1, but received %d.",
                   concat_axis);

    size_t x_rank = ins[0].size();
    size_t trans_axis_size = trans_axis.size();
    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
                      "The input tensor's rank(%d) "
                      "should be equal to the permutation axis's size(%d)",
                      x_rank, trans_axis_size);

    // The first input seeds the output shape; the remaining inputs are
    // accumulated along concat_axis and checked on the other dimension.
    auto dims0 =
        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
    std::vector<int> out_dims(dims0);
    for (size_t i = 1; i < n; i++) {
      auto dimsi =
          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
        if (j == concat_axis) {
          out_dims[concat_axis] += dimsi[j];
        } else {
          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
                            "After flattening, the %d-th dim should be the "
                            "same except for the specified axis.",
                            j);
        }
      }
    }
    // Any negative accumulated size is normalized to -1.
    // NOTE(review): presumably negative extents stand for dimensions that
    // are unknown at this point - confirm.
    if (out_dims[concat_axis] < 0) {
      out_dims[concat_axis] = -1;
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
  }
};
// Declares the input, output and attributes of the fused
// transpose/flatten/concat operator, together with their user-facing
// descriptions.
class TransposeFlattenConcatFusionOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "X",
        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
        .AsDuplicable();
    AddOutput("Out", "(Tensor)The output tensor.");
    AddAttr<std::vector<int>>(
        "trans_axis",
        "(vector<int>) A list of values, and the size of the list should be "
        "the same with the input tensor rank. This operator permutes the input "
        "tensor's axes according to the values given.");
    // Adjacent string literals are concatenated verbatim, so every fragment
    // except the last needs its own trailing space.
    AddAttr<int>(
        "flatten_axis",
        "(int) "
        "Indicate up to which input dimensions (exclusive) should be "
        "flattened to the outer dimension of the output. The value "
        "for axis must be in the range [0, R], where R is the rank of "
        "the input tensor. When axis = 0, the shape of the output "
        "tensor is (1, d_0 X d_1 ... d_n), where the shape of the "
        "input tensor is (d_0, d_1, ... d_n).");
    AddAttr<int>("concat_axis",
                 "The axis along which the input tensors will be concatenated. "
                 "It should be 0 or 1, since the tensor is 2D after flatting.");
    AddComment(R"DOC(
Fusion of the transpose, flatten and concat operators.

Each input tensor is permuted according to trans_axis, flattened into a 2-D
tensor at flatten_axis, and the resulting 2-D tensors are concatenated along
concat_axis to form the output.
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias so the registration macro arguments stay readable.
namespace
ops
=
paddle
::
operators
;
// Registers the operator type `fusion_transpose_flatten_concat` together
// with its operator class and its proto/attribute maker.
// NOTE(review): EmptyGradOpMaker is passed as the gradient maker, which
// presumably means no backward op is generated for this op - confirm.
REGISTER_OPERATOR
(
fusion_transpose_flatten_concat
,
ops
::
TransposeFlattenConcatFusionOp
,
ops
::
TransposeFlattenConcatFusionOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
0 → 100644
浏览文件 @
6250be4b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
// Local alias that lets code in this namespace write CudnnDataType<T>
// without spelling out the platform:: qualifier.
template
<
typename
T
>
using
CudnnDataType
=
platform
::
CudnnDataType
<
T
>
;
// CUDA kernel of the fused transpose/flatten/concat operator.
//
// Each input is copied into the output buffer with one
// cudnnTransformTensor call. The source descriptor carries the input's
// strides reordered by `trans_axis`; the destination descriptor carries the
// strides of that input's region inside the 2-D output.
template <typename T>
class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto inputs = ctx.MultiInput<framework::Tensor>("X");
    auto* output = ctx.Output<framework::Tensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());
    auto result_dims = output->dims();

    std::vector<int> perm = ctx.Attr<std::vector<int>>("trans_axis");
    int flatten_axis = ctx.Attr<int>("flatten_axis");
    int concat_axis = ctx.Attr<int>("concat_axis");

    int rank = inputs[0]->dims().size();
    // The descriptors handed to cudnnTransformTensor use at least 4 dims;
    // lower-rank inputs are padded with trailing dims of extent 1.
    int desc_rank = rank < 4 ? 4 : rank;
    std::vector<int> src_strides(desc_rank, 0);
    std::vector<int> dst_strides(desc_rank, 0);
    std::vector<int> extents(desc_rank, 0);

    cudnnTensorDescriptor_t src_desc;
    cudnnTensorDescriptor_t dst_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&src_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dst_desc));
    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();

    // Write cursor into the output buffer; advanced after every input.
    T* write_ptr = output->data<T>();
    for (size_t k = 0; k < inputs.size(); ++k) {
      auto source_dims = inputs[k]->dims();
      auto perm_shape = GetPermuteShape(perm, source_dims);

      // Source side: stride of permuted axis i is the product of the input
      // extents that follow axis perm[i] in the original layout.
      int element_count = 1;
      for (int i = 0; i < rank; i++) {
        int stride = 1;
        for (int j = perm[i] + 1; j < rank; j++) {
          stride *= source_dims[j];
        }
        src_strides[i] = stride;
        extents[i] = perm_shape[i];
        element_count *= perm_shape[i];
      }

      // Destination side: the output is 2-D because concat happens after
      // flatten. With concat_axis == 0 each permuted input occupies a
      // contiguous range. With concat_axis == 1 the axis just before
      // flatten_axis has to step over a whole output row, i.e. by
      // result_dims[1].
      dst_strides[rank - 1] = 1;
      for (int i = rank - 2; i >= 0; i--) {
        const bool steps_over_row =
            ((i + 1) == flatten_axis) && (concat_axis == 1);
        if (steps_over_row) {
          dst_strides[i] = result_dims[1];
        } else {
          dst_strides[i] = dst_strides[i + 1] * perm_shape[i + 1];
        }
      }

      // Padding dims added to reach desc_rank.
      for (int i = rank; i < desc_rank; i++) {
        src_strides[i] = 1;
        dst_strides[i] = 1;
        extents[i] = 1;
      }

      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          src_desc, cudnn_dtype, desc_rank, extents.data(),
          src_strides.data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          dst_desc, cudnn_dtype, desc_rank, extents.data(),
          dst_strides.data()));

      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
          handle, CudnnDataType<T>::kOne(), src_desc,
          static_cast<const void*>(inputs[k]->data<T>()),
          CudnnDataType<T>::kZero(), dst_desc, static_cast<void*>(write_ptr)));

      // Advance by the whole input when stacking rows, or by the width of
      // this input's flattened row when stacking columns.
      if (concat_axis == 0) {
        write_ptr += element_count;
      } else {
        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
        write_ptr += flat_shape[1];
      }
    }

    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(src_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dst_desc));
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias so the registration macro arguments stay readable.
namespace
ops
=
paddle
::
operators
;
// Registers the CUDA kernels of `fusion_transpose_flatten_concat`, one
// instantiation for float and one for double.
REGISTER_OP_CUDA_KERNEL
(
fusion_transpose_flatten_concat
,
ops
::
TransposeFlattenConcatFusionKernel
<
float
>
,
ops
::
TransposeFlattenConcatFusionKernel
<
double
>
);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
0 → 100644
浏览文件 @
6250be4b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace
paddle
{
namespace
operators
{
// Returns the shape produced by permuting `in_dims` with `axis`: entry i of
// the result is in_dims[axis[i]].
//
// The result always has in_dims.size() entries; entries at positions not
// covered by `axis` keep their value-initialized 0.
inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
                                            const framework::DDim& in_dims) {
  std::vector<int32_t> permuted(in_dims.size());
  size_t dst = 0;
  for (const int src : axis) {
    permuted[dst] = in_dims[src];
    ++dst;
  }
  return permuted;
}
// Collapses `in_dims` into a 2-D shape {outer, inner}.
//
// `outer` is the product of the extents at positions [0, axis) and `inner`
// is the product of the extents at positions [axis, rank). An empty range
// contributes a factor of 1. The products are accumulated in 64 bits and
// then converted to int32_t for the returned vector.
inline std::vector<int32_t> GetFlattenShape(const int axis,
                                            const std::vector<int>& in_dims) {
  int64_t leading = 1;
  int64_t trailing = 1;
  int position = 0;
  for (const int extent : in_dims) {
    // Positions before `axis` feed the outer product, the rest the inner.
    int64_t& product = (position < axis) ? leading : trailing;
    product *= extent;
    ++position;
  }
  return {static_cast<int32_t>(leading), static_cast<int32_t>(trailing)};
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/lookup_sparse_table_op.cc
浏览文件 @
6250be4b
...
...
@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
framework
::
proto
::
VarType
::
FP32
,
"The sparse table only support FP32"
);
w_t
->
Get
(
ids_t
,
out_t
,
true
,
is_test
);
out_t
->
set_lod
(
ids_t
.
lod
());
}
};
...
...
paddle/fluid/operators/sum_op.h
浏览文件 @
6250be4b
...
...
@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_add
;
merge_add
(
context
.
template
device_context
<
DeviceContext
>(),
inputs
,
out
);
out
->
SyncIndex
();
}
else
{
// no data, just set a empty out tensor.
out
->
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
0
}),
...
...
paddle/testing/paddle_gtest_main.cc
浏览文件 @
6250be4b
...
...
@@ -31,6 +31,11 @@ int main(int argc, char** argv) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv
.
push_back
(
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
));
#elif __clang__
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"
));
new_argv
.
push_back
(
strdup
(
"--undefok=use_mkldnn,initial_cpu_memory_in_mb"
));
#else
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
...
...
python/paddle/fluid/__init__.py
浏览文件 @
6250be4b
...
...
@@ -91,6 +91,7 @@ def __bootstrap__():
"""
import
sys
import
os
import
platform
from
.
import
core
in_test
=
'unittest'
in
sys
.
modules
...
...
@@ -110,14 +111,17 @@ def __bootstrap__():
print
(
'PLEASE USE OMP_NUM_THREADS WISELY.'
,
file
=
sys
.
stderr
)
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
sysstr
=
platform
.
system
()
read_env_flags
=
[
'
use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope
'
,
'use_
mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb
'
,
'
init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'
check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'use_mkldnn
'
,
'use_
ngraph'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem
'
,
'
free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'cpu_deterministic'
)
...
...
python/paddle/fluid/contrib/utils/__init__.py
浏览文件 @
6250be4b
...
...
@@ -13,8 +13,10 @@
# limitations under the License.
from
__future__
import
print_function
from
.
import
lookup_table_utils
from
.lookup_table_utils
import
*
from
.
import
hdfs_utils
from
.hdfs_utils
import
*
__all__
=
lookup_table_utils
.
__all__
__all__
=
hdfs_utils
.
__all__
python/paddle/fluid/contrib/utils/lookup_table_utils.py
0 → 100644
浏览文件 @
6250be4b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
time
import
logging
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
from
paddle.fluid
import
io
from
paddle.fluid
import
Program
# Public names of this module.
__all__
=
[
"load_inference_model"
,
"load_persistable_vars"
,
"convert_dist_to_sparse_program"
]
# Configure the root logging format, then create this module's logger and
# set it to INFO.
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(message)s'
)
_logger
=
logging
.
getLogger
(
"lookup_table_utils"
)
_logger
.
setLevel
(
logging
.
INFO
)
# File name of the serialized program inside a saved-model directory.
model_filename
=
"__model__"
# Sub-directory of a saved-model directory that holds lookup table files.
lookup_table_dir
=
"__lookup_table__"
def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
    """Insert a `lookup_sparse_table` op into the program's global block.

    Args:
        main_program: program whose global block receives the op.
        idx: position in the block's op list at which the op is inserted.
        ids: variable passed as the op's "Ids" input.
        w: variable passed as the op's "W" input.
        out: variable passed as the op's "Out" output.
    """
    block = main_program.global_block()
    op_inputs = {"Ids": [ids], "W": [w]}
    op_outputs = {"Out": [out]}
    op_attrs = {
        "is_distributed": False,
        "is_sparse": True,
        "grad_inplace": False
    }
    block._insert_op(
        index=idx,
        type="lookup_sparse_table",
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
def __get_prefetch_op_tuples(main_program):
    """Locate the first `split_ids -> prefetch -> merge_ids` op triple.

    Args:
        main_program: program whose global block is searched.

    Returns:
        None when no such triple exists, otherwise a tuple
        `(split_ids_op_id, split_ids_inputs, merge_ids_outputs,
        need_delete_vars)` where `split_ids_op_id` is the index of the
        `split_ids` op, `split_ids_inputs` is its "Ids" input,
        `merge_ids_outputs` is the "Out" output of `merge_ids`, and
        `need_delete_vars` lists the "X" inputs followed by the "Out"
        outputs of the `prefetch` op.
    """
    # current lookup tables op is split_ids->prefetch->merge_ids
    ops = main_program.global_block().ops
    op_types = [op.type for op in ops]

    # A prefetch op can only be part of the pattern when it has both a
    # predecessor and a successor. Starting at 1 and stopping one short of
    # the end keeps `i - 1` from wrapping around to the last op and keeps
    # `i + 1` inside the list.
    for i in range(1, len(op_types) - 1):
        if op_types[i] != "prefetch":
            continue
        if op_types[i - 1] != "split_ids" or op_types[i + 1] != "merge_ids":
            continue

        split_ids_op_id = i - 1
        split_ids_inputs = ops[i - 1].input("Ids")
        prefetch_op_inputs = ops[i].input("X")
        prefetch_op_outputs = ops[i].output("Out")
        merge_ids_outputs = ops[i + 1].output("Out")

        need_delete_vars = []
        need_delete_vars.extend(prefetch_op_inputs)
        need_delete_vars.extend(prefetch_op_outputs)

        return (split_ids_op_id, split_ids_inputs, merge_ids_outputs,
                need_delete_vars)

    return None
def convert_dist_to_sparse_program(main_program):
    """Rewrite a program that uses a distributed lookup table to use a local
    sparse table instead.

    The table variable is renamed to `<name>.origin`, a new SELECTED_ROWS
    variable is created under the original name, the
    `split_ids -> prefetch -> merge_ids` op triple is removed, and one
    `lookup_sparse_table` op is inserted for every (ids, out) pair.

    Args:
        main_program: program to convert; it is modified in place.

    Returns:
        The converted `main_program`, or None when the program has no
        distributed lookup table.

    Raises:
        ValueError: if the program has a distributed lookup table but no
            `split_ids -> prefetch -> merge_ids` op triple. The table
            variable has already been renamed at that point.
    """
    if not main_program._distributed_lookup_table:
        # Logger.warn is a deprecated alias; use warning().
        _logger.warning(
            "There are no distributed lookup tables need to be converted")
        return

    # create table param and grad var in pserver program
    origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
    emb_var = main_program._distributed_lookup_table
    main_program.global_block()._rename_var(emb_var, origin_emb_var)
    origin_param_var = main_program.global_block().vars[origin_emb_var]

    param_var = main_program.global_block().create_var(
        name=emb_var,
        shape=origin_param_var.shape,
        dtype=origin_param_var.dtype,
        type=core.VarDesc.VarType.SELECTED_ROWS,
        persistable=True)
    # parameter must be selected rows
    param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
    main_program._sync_with_cpp()

    prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
    if prefetch_op_tuples is None:
        # Fail with a clear message instead of a TypeError from indexing None.
        raise ValueError(
            "Can not find the split_ids->prefetch->merge_ids ops of the "
            "distributed lookup table '%s' in the program" % emb_var)

    split_ids_id = prefetch_op_tuples[0]

    # Remove merge_ids, prefetch and split_ids, highest index first so the
    # remaining indices stay valid.
    for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
        main_program.global_block()._remove_op(idx)
    main_program.desc.flush()

    in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])

    for in_out_pair in in_out_pairs:
        idx = split_ids_id
        ids = main_program.global_block().vars[in_out_pair[0]]
        out = main_program.global_block().vars[in_out_pair[1]]
        __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
        main_program.desc.flush()

    return main_program
def load_persistable_vars(executor, dirname, program, lookup_table_var):
    """Load a program's persistable variables and its lookup table.

    Ordinary persistable variables are loaded from `dirname` with
    `io.load_vars`. The lookup table named `lookup_table_var` is excluded
    from that pass and is rebuilt afterwards from the files found under
    `<dirname>/<lookup_table_dir>`.

    Args:
        executor: executor used to run the generated load programs.
        dirname: directory the variables were saved to.
        program: program whose variables are loaded.
        lookup_table_var: name of the lookup table variable.

    Raises:
        ValueError: if `dirname` is not a directory (checked when the
            lookup table is loaded).
    """

    def _is_checkpoint_var(exclude_fluid_vars=None):
        """Build the predicate that selects variables to load.

        The returned predicate rejects FEED_MINIBATCH / FETCH_LIST / RAW
        variables, variables whose name contains "@GRAD", ".trainer_",
        ".block" or "tmp_", and variables named in `exclude_fluid_vars`.
        Every other variable is accepted when it is persistable.
        """

        if exclude_fluid_vars is None:
            exclude_fluid_vars = []

        def is_valid(var):
            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.RAW:
                return False
            # @GRAD are named for gradient variables, checkpoint will not save it.
            if "@GRAD" in var.name:
                return False
            # .trainer_ are named for distribute train variables, checkpoint will not save it.
            if ".trainer_" in var.name:
                return False
            # .block is named for distribute train variables, checkpoint will not save it.
            if ".block" in var.name:
                return False
            if "tmp_" in var.name:
                return False
            if var.name in exclude_fluid_vars:
                return False

            return var.persistable

        return is_valid

    def _load_lookup_table_vars(executor, dirname, main_program,
                                lookup_table_vars):
        """Load every shard file of the lookup table and sum the shards
        into a single SELECTED_ROWS variable."""
        if not os.path.isdir(dirname):
            # Format the path into the message; passing it as a second
            # argument would leave the '%s' placeholder unfilled.
            raise ValueError("There is no directory named '%s'" % dirname)

        lookup_table_dirname = os.path.join(dirname, lookup_table_dir)

        emb_var_name = lookup_table_vars[0]
        emb_var = main_program.global_block().var(emb_var_name)

        emb_files = []
        for emb_name in os.listdir(lookup_table_dirname):
            if emb_var_name in emb_name:
                emb_files.append(emb_name)

        convert_program = Program()
        global_block = convert_program.global_block()

        emb_var = global_block.create_var(
            name=emb_var.name,
            shape=emb_var.shape,
            dtype=emb_var.dtype,
            type=core.VarDesc.VarType.SELECTED_ROWS,
            persistable=True)
        emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)

        sums = []

        # NOTE(review): each shard is read from "<table name>_<i>" rather
        # than from the listed file name `emb_file`; this assumes the saved
        # shard files follow that naming - confirm against the save path.
        for i, emb_file in enumerate(emb_files):
            var_name = "{}_{}".format(emb_var.name, i)
            param_var = global_block.create_var(
                name=var_name,
                shape=emb_var.shape,
                dtype=emb_var.dtype,
                type=core.VarDesc.VarType.SELECTED_ROWS,
                persistable=True)
            param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
            global_block.append_op(
                type='load',
                inputs={},
                outputs={'Out': [param_var]},
                attrs={
                    'file_path': os.path.join(lookup_table_dirname, var_name)
                })
            sums.append(param_var)
        global_block.append_op(
            type='sum', inputs={"X": sums}, outputs={'Out': emb_var}, attrs={})
        # The per-shard variables are only needed until they are summed.
        global_block.append_op(type='delete_var', inputs={'X': sums})
        executor.run(convert_program)

    _logger.info("Start Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))

    lookup_table_vars = [lookup_table_var]

    io.load_vars(
        executor,
        dirname=dirname,
        main_program=program,
        predicate=_is_checkpoint_var(lookup_table_vars),
        filename=None)

    _load_lookup_table_vars(executor, dirname, program, lookup_table_vars)

    _logger.info("Finish Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
    """Load an inference program and its variables, including the lookup
    table, from `dirname`.

    Args:
        dirname: directory that contains the `model_filename` file and the
            saved variables.
        executor: executor used to run the load programs.
        lookup_table_var_name: name of the lookup table variable.

    Returns:
        A list `[program, feed_target_names, fetch_targets]`, where
        `fetch_targets` are the global-block variables named by the
        program's fetch target names.

    Raises:
        ValueError: if `dirname` is not a directory, or if the program
            version is not supported.
    """
    if not os.path.isdir(dirname):
        # Format the path into the message; passing it as a second argument
        # would leave the '%s' placeholder unfilled.
        raise ValueError("There is no directory named '%s'" % dirname)

    local_model = os.path.join(dirname, model_filename)

    with open(local_model, "rb") as f:
        program_desc_str = f.read()

    program = Program.parse_from_string(program_desc_str)

    if not core._is_program_version_supported(program._version()):
        raise ValueError("Unsupported program version: %d\n" %
                         program._version())

    # Binary data also need version.
    load_persistable_vars(executor, dirname, program, lookup_table_var_name)

    feed_target_names = program.desc.get_feed_target_names()
    fetch_target_names = program.desc.get_fetch_target_names()
    fetch_targets = [
        program.global_block().var(name) for name in fetch_target_names
    ]

    return [program, feed_target_names, fetch_targets]
python/paddle/fluid/framework.py
浏览文件 @
6250be4b
...
...
@@ -1698,6 +1698,7 @@ class Program(object):
p
.
_copy_param_info_from
(
self
)
p
.
_copy_data_info_from
(
self
)
p
.
_copy_dist_param_info_from
(
self
)
return
p
def
_prune
(
self
,
targets
):
...
...
@@ -1938,6 +1939,25 @@ class Program(object):
"program, with represent the same topology"
)
self
.
global_block
().
_copy_param_info_from
(
other
.
global_block
())
def
_copy_dist_param_info_from
(
self
,
other
):
"""
Copy the information of distributed information from other program.
Args:
other(Program): Other program
Returns:
None
"""
if
not
isinstance
(
other
,
Program
):
raise
TypeError
(
"_copy_dist_param_info_from should be invoked with "
"Program"
)
self
.
_is_distributed
=
other
.
_is_distributed
self
.
_is_chief
=
other
.
_is_chief
self
.
_slice_vars_and_attrs
=
other
.
_slice_vars_and_attrs
self
.
_endpoints
=
other
.
_endpoints
self
.
_distributed_lookup_table
=
other
.
_distributed_lookup_table
def
_copy_data_info_from
(
self
,
other
):
"""
Copy the information of data variables from other program.
...
...
python/paddle/fluid/io.py
浏览文件 @
6250be4b
...
...
@@ -165,6 +165,7 @@ def save_vars(executor,
save_vars
(
executor
,
main_program
=
main_program
,
dirname
=
dirname
,
vars
=
list
(
filter
(
predicate
,
main_program
.
list_vars
())),
filename
=
filename
)
...
...
@@ -172,11 +173,18 @@ def save_vars(executor,
save_program
=
Program
()
save_block
=
save_program
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
save_var_map
=
{}
for
each_var
in
vars
:
# NOTE: don't save the variable which type is RAW
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
if
each_var
.
name
==
main_program
.
_distributed_lookup_table
:
continue
new_var
=
_clone_var_in_block_
(
save_block
,
each_var
)
if
filename
is
None
:
save_block
.
append_op
(
...
...
@@ -198,6 +206,16 @@ def save_vars(executor,
outputs
=
{},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
attrs
=
{}
attrs
[
'epmap'
]
=
main_program
.
_endpoints
attrs
[
'dir'
]
=
lookup_table_filename
attrs
[
'lookup_table'
]
=
main_program
.
_distributed_lookup_table
save_block
.
append_op
(
type
=
'checkpoint_notify'
,
inputs
=
{},
outputs
=
{},
attrs
=
attrs
)
executor
.
run
(
save_program
)
...
...
@@ -379,11 +397,22 @@ def load_vars(executor,
load_prog
=
Program
()
load_block
=
load_prog
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
load_slice_vars
=
[]
for
each_var
in
main_program
.
_slice_vars_and_attrs
:
load_slice_vars
.
append
(
each_var
[
2
].
name
)
load_var_map
=
{}
for
each_var
in
vars
:
assert
isinstance
(
each_var
,
Variable
)
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
if
each_var
.
name
in
load_slice_vars
:
continue
new_var
=
_clone_var_in_block_
(
load_block
,
each_var
)
if
filename
is
None
:
load_block
.
append_op
(
...
...
@@ -406,9 +435,6 @@ def load_vars(executor,
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
executor
.
run
(
load_prog
)
if
main_program
is
None
:
main_program
=
default_main_program
()
# load slice vars on pserver, if have it.
_load_slice_up_vars
(
executor
,
dirname
,
main_program
.
_slice_vars_and_attrs
)
...
...
@@ -618,13 +644,6 @@ def save_inference_model(dirname,
if
main_program
is
None
:
main_program
=
default_main_program
()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
_save_lookup_tables_by_notify
(
executor
,
lookup_table_filename
,
main_program
.
_distributed_lookup_table
,
main_program
.
_endpoints
)
# when a pserver and a trainer running on the same machine, mkdir may conflict
try
:
os
.
makedirs
(
dirname
)
...
...
@@ -642,6 +661,9 @@ def save_inference_model(dirname,
# it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be
# more flexible.
origin_program
=
main_program
.
clone
()
if
export_for_deployment
:
main_program
=
main_program
.
clone
()
global_block
=
main_program
.
global_block
()
...
...
@@ -666,8 +688,11 @@ def save_inference_model(dirname,
with
open
(
model_basename
+
".main_program"
,
"wb"
)
as
f
:
f
.
write
(
main_program
.
desc
.
serialize_to_string
())
main_program
.
_copy_dist_param_info_from
(
origin_program
)
if
params_filename
is
not
None
:
params_filename
=
os
.
path
.
basename
(
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
...
...
@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
slice_var
=
var_tuple
[
2
]
end
=
start
+
slice_var
.
shape
[
0
]
orig_var_name
=
orig_var
.
name
orig_var
.
name
=
"{}.origin"
.
format
(
orig_var_name
)
clone_orig_var
=
load_block
.
create_var
(
name
=
orig_var
.
name
,
type
=
orig_var
.
type
,
...
...
@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
type
=
'load'
,
inputs
=
{},
outputs
=
{
'Out'
:
[
clone_orig_var
]},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
clone_orig_var
.
name
)})
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
orig_var_
name
)})
load_block
.
append_op
(
type
=
"slice"
,
inputs
=
{
'Input'
:
clone_orig_var
},
...
...
@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
'starts'
:
[
start
],
'ends'
:
[
end
]})
need_delete_vars
.
append
(
clone_orig_var
)
load_block
.
append_op
(
type
=
'delete_var'
,
inputs
=
{
'X'
:
need_delete_vars
},
)
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
6250be4b
...
...
@@ -6972,14 +6972,14 @@ def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
y = \max(0, x) + alpha
*
\min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
mode (string): The mode for weight sharing
. It supports all, channel
and element.
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
...
...
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
0 → 100644
浏览文件 @
6250be4b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
class TestFusionTransposeFlattenConcationOp(OpTest):
    """Checks the ``fusion_transpose_flatten_concat`` op against numpy.

    The reference result is built by transposing every input with
    ``trans_axis``, collapsing it to 2-D around ``flatten_axis`` and then
    concatenating all the 2-D pieces along ``concat_axis``.  Subclasses only
    override ``init_test_case`` to try other shapes and axes.
    """

    def init_test_case(self):
        # Default configuration: three 4-D inputs, transposed with
        # (0, 2, 3, 1), flattened to (3, -1) and joined along axis 1.
        self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis = 1
        self.concat_axis = 1

    def setUp(self):
        self.init_test_case()
        self.op_type = "fusion_transpose_flatten_concat"

        named_inputs = []
        flattened = []
        for idx, shape in enumerate(self.shapes):
            data = np.random.random(shape).astype("float32")
            named_inputs.append(("x%d" % idx, data))

            # Reference path: transpose, then fold the leading and trailing
            # dimensions around flatten_axis into a 2-D matrix.
            transposed = data.transpose(self.trans_axis)
            rows = np.prod(transposed.shape[:self.flatten_axis])
            cols = np.prod(transposed.shape[self.flatten_axis:])
            flattened.append(transposed.reshape((rows, cols)))

        expected = np.concatenate(flattened, axis=self.concat_axis)

        self.inputs = {'X': named_inputs}
        self.attrs = {
            'trans_axis': list(self.trans_axis),
            'flatten_axis': self.flatten_axis,
            'concat_axis': self.concat_axis
        }
        self.outputs = {'Out': expected}

    def test_check_output(self):
        # The output is only checked on a CUDA place; without CUDA the test
        # is a no-op.
        if not core.is_compiled_with_cuda():
            return
        self.check_output_with_place(core.CUDAPlace(0), 1e-6)
class TestCase1(TestFusionTransposeFlattenConcationOp):
    """Flatten at axis 2: each transposed input becomes a (54, k) matrix."""

    def init_test_case(self):
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis = 2
        self.concat_axis = 1
        # The first two transposed dims multiply to 54 for every input, so
        # the pieces line up for concatenation along axis 1.
        self.shapes = [
            (3, 4, 18, 17),
            (3, 8, 18, 7),
            (6, 12, 9, 5),
        ]
class TestCase2(TestFusionTransposeFlattenConcationOp):
    """Concatenate along axis 0: flattened inputs share 136 columns."""

    def init_test_case(self):
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis = 2
        self.concat_axis = 0
        # Only the third dimension differs, so the row counts vary
        # (60, 57, 120) while the column count stays at 17 * 8.
        self.shapes = [
            (3, 8, 20, 17),
            (3, 8, 19, 17),
            (3, 8, 40, 17),
        ]
class TestCase3(TestFusionTransposeFlattenConcationOp):
    """Uses the (0, 3, 2, 1) permutation with flatten and concat at axis 1."""

    def init_test_case(self):
        self.trans_axis = (0, 3, 2, 1)
        self.flatten_axis = 1
        self.concat_axis = 1
        # Every input keeps a leading dimension of 3, giving (3, k) pieces.
        self.shapes = [
            (3, 8, 20, 17),
            (3, 8, 19, 17),
            (3, 8, 40, 17),
        ]
class TestCase4(TestFusionTransposeFlattenConcationOp):
    """Flatten at the last axis: each input collapses to a (216, 17) matrix."""

    def init_test_case(self):
        self.trans_axis = (0, 2, 1, 3)
        self.flatten_axis = 3
        self.concat_axis = 1
        # The leading three dims differ per input but always multiply to 216.
        self.shapes = [
            (3, 8, 9, 17),
            (8, 3, 9, 17),
            (4, 6, 9, 17),
        ]
class TestCase5(TestFusionTransposeFlattenConcationOp):
    """Exercises 5-D inputs with the (0, 2, 1, 4, 3) permutation."""

    def init_test_case(self):
        self.trans_axis = (0, 2, 1, 4, 3)
        self.flatten_axis = 1
        self.concat_axis = 1
        # All three inputs hold 3 * 2448 elements, so each flattens to
        # (3, 2448) before being joined along axis 1.
        self.shapes = [
            (3, 8, 9, 17, 2),
            (3, 8, 2, 17, 9),
            (3, 17, 9, 8, 2),
        ]
# Run all test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
6250be4b
...
...
@@ -644,6 +644,9 @@ in a single call.")
else
:
recv_inputs
.
append
(
single_trainer_var
)
self
.
_slice_params_and_optimizes
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
# step 3
# Create a union-find data structure from optimize ops,
# If two ops are connected, we could add these two ops
...
...
@@ -766,7 +769,7 @@ in a single call.")
grad_to_block_id
,
merged_var
,
lr_ops
)
# dedup grad to ids list
# dedup grad to ids list
grad_to_block_id
=
list
(
set
(
grad_to_block_id
))
# append global ops
if
global_ops
:
...
...
@@ -827,8 +830,8 @@ in a single call.")
attrs
=
attrs
)
# add distributed attrs
pserver_program
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
pserver_program
.
_slice_vars_and_attrs
=
list
(
self
.
_slice_params_and_optimizes
.
values
()
)
pserver_program
.
_sync_with_cpp
()
# save pserver program to generate pserver side startup relatively.
...
...
@@ -941,12 +944,12 @@ to transpile() call.")
outputs
=
{
"Out"
:
startup_tmpvar
})
# add slice vars
s_prog
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
s_prog
.
_slice_vars_and_attrs
=
pserver_program
.
_slice_vars_and_attrs
return
s_prog
def
_get_slice_vars_and_attrs
(
self
,
endpoint
):
slice_vars_and_attrs
=
[]
slice_vars_and_attrs
=
{}
block_suffix
=
"block"
for
param
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
orig_var_name
,
block_name
,
_
=
self
.
_get_varname_parts
(
param
.
name
)
...
...
@@ -960,8 +963,7 @@ to transpile() call.")
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
for
slice_var
in
slice_vars
[:
block_idx
]:
skip_dim0
+=
slice_var
.
shape
[
0
]
slice_vars_and_attrs
.
append
([
orig_var
,
skip_dim0
,
param
])
slice_vars_and_attrs
[
param
.
name
]
=
[
orig_var
,
skip_dim0
,
param
]
return
slice_vars_and_attrs
# ====================== private transpiler functions =====================
...
...
@@ -1662,10 +1664,10 @@ to transpile() call.")
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
]:
continue
var
=
self
.
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
param_var
=
new_inputs
[
"Param"
]
# update accumulator variable shape
param_shape
=
new_inputs
[
"Param"
].
shape
new_shape
=
self
.
_get_optimizer_input_shape
(
opt_op
.
type
,
key
,
var
.
shape
,
param_shape
)
new_shape
=
self
.
_get_optimizer_input_shape
(
opt_op
.
type
,
key
,
var
.
shape
,
param_var
.
shape
)
tmpvar
=
pserver_block
.
create_var
(
name
=
var
.
name
,
persistable
=
var
.
persistable
,
...
...
@@ -1673,6 +1675,13 @@ to transpile() call.")
shape
=
new_shape
)
new_inputs
[
key
]
=
tmpvar
# var shape has been changed
if
new_shape
!=
var
.
shape
:
slice_var_args
=
self
.
_slice_params_and_optimizes
[
param_var
.
name
]
self
.
_slice_params_and_optimizes
[
var
.
name
]
=
[
var
,
slice_var_args
[
1
],
tmpvar
]
# change output's ParamOut variable
outputs
=
self
.
_get_output_map_from_op
(
self
.
origin_program
.
global_block
().
vars
,
opt_op
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录