BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 9eb3fb29

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into warpctc

Authored Jan 11, 2018 by wanghaoshuang
Parents: b1af5e43, 020630b7
Showing 31 changed files with 926 additions and 589 deletions (+926 -589).
Files changed:

paddle/framework/CMakeLists.txt                          +6   -4
paddle/framework/data_device_transform.cc                +1   -1
paddle/framework/data_device_transform.h                 +0   -0   (renamed)
paddle/framework/data_device_transform_test.cu           +0   -0   (renamed)
paddle/framework/data_layout.h                           +0   -1
paddle/framework/data_layout_transform.cc                +82  -0   (new)
paddle/framework/data_layout_transform.h                 +31  -0   (new)
paddle/framework/data_transform.cc                       +2   -139
paddle/framework/data_transform.h                        +0   -140
paddle/framework/data_transform_test.cc                  +0   -168 (deleted)
paddle/framework/data_type_transform.cc                  +99  -0   (new)
paddle/framework/data_type_transform.h                   +31  -0   (new)
paddle/framework/lod_tensor.cc                           +29  -32
paddle/framework/operator.cc                             +0   -1
paddle/framework/tensor_util.h                           +18  -18
paddle/framework/var_desc.cc                             +1   -1
paddle/gserver/layers/MKLDNNLayer.cpp                    +2   -0
paddle/gserver/layers/MKLDNNLayer.h                      +2   -0
paddle/operators/edit_distance_op.cc                     +98  -0   (new)
paddle/operators/edit_distance_op.cu                     +149 -0   (new)
paddle/operators/edit_distance_op.h                      +96  -0   (new)
paddle/operators/get_places_op.cc                        +18  -15
paddle/operators/parallel_do_op.cc                       +38  -18
paddle/operators/shrink_rnn_memory_op.cc                 +20  -5
paddle/scripts/submit_local.sh.in                        +3   -0
python/paddle/v2/__init__.py                             +22  -13
python/paddle/v2/fluid/layers/device.py                  +10  -7
python/paddle/v2/fluid/tests/test_edit_distance_op.py    +94  -0
python/paddle/v2/fluid/tests/test_parallel_op.py         +1   -1
python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py   +67  -24
python/setup.py.in                                       +6   -1
paddle/framework/CMakeLists.txt

@@ -32,10 +32,12 @@ cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 cc_library(scope SRCS scope.cc DEPS glog threadpool)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
-cc_library(device_data_transform SRCS device_data_transform.cc DEPS tensor)
+cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
-cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
-           framework_proto selected_rows device_data_transform)
-cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
+           framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
@@ -80,5 +82,5 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-nv_test(device_data_transform_test SRCS device_data_transform_test.cu
+nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry init math_function)
paddle/framework/device_data_transform.cc → paddle/framework/data_device_transform.cc

@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/framework/device_data_transform.h"
+#include "paddle/framework/data_device_transform.h"

 namespace paddle {
 namespace framework {

paddle/framework/device_data_transform.h → paddle/framework/data_device_transform.h

File moved.

paddle/framework/device_data_transform_test.cu → paddle/framework/data_device_transform_test.cu

File moved.
paddle/framework/data_layout.h

@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/platform/enforce.h"

 #include <iostream>
 #include "paddle/platform/enforce.h"
paddle/framework/data_layout_transform.cc
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/data_layout_transform.h"

#include "paddle/framework/tensor.h"
#include "paddle/operators/math/math_function.h"

namespace paddle {
namespace framework {

struct CastDataLayout {
  CastDataLayout(const platform::DeviceContext* ctx,
                 const std::vector<int>& axis, const framework::Tensor& in,
                 framework::Tensor* out)
      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
  const framework::Tensor in_;
  framework::Tensor* out_;
  const platform::DeviceContext* ctx_;
  const std::vector<int> axis_;

  template <typename T>
  void operator()() {
    auto place = ctx_->GetPlace();

    if (platform::is_cpu_place(place)) {
      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
      trans4(*context, in_, out_, axis_);
    } else {
      PADDLE_THROW("Unsupport CPU <-> GPU!");
    }
  }
};

void TransDataLayout(const std::vector<int>& axis,
                     const platform::DeviceContext* ctx,
                     const KernelTypePair& kernel_pair, const Variable& in,
                     Variable* out) {
  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
  PADDLE_ENFORCE(
      platform::places_are_same_class(kernel_pair.first.place_,
                                      kernel_pair.second.place_),
      "TransDataLayout only support DataLayout transform on same place!");
  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
                 "TransDataLayout only support Datatype are same!");

  auto src = in.Get<Tensor>();
  auto* dst = out->GetMutable<Tensor>();
  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");

  auto src_dim = src.dims();
  std::vector<int64_t> dst_dim;

  dst_dim.resize(axis.size());
  for (size_t i = 0; i < axis.size(); i++) {
    dst_dim[i] = src_dim[axis[i]];
  }

  dst->Resize(make_ddim(dst_dim));
  auto place = kernel_pair.second.place_;
  dst->mutable_data(place, src.type());

  auto src_type = kernel_pair.first.data_type_;
  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));

  dst->set_layout(kernel_pair.second.data_layout_);
}

}  // namespace framework
}  // namespace paddle
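The permutation rule in TransDataLayout is compact enough to miss: entry i of the output shape is src_dim[axis[i]], so axis = {0, 3, 1, 2} maps an NHWC shape onto NCHW. A standalone sketch of just that shape arithmetic (illustration only, not Paddle code; the dimensions are borrowed from the DataLayout unit test in the deleted data_transform_test.cc below):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // NHWC shape: batch=2, height=3, width=1, channels=2.
  std::vector<int64_t> src_dim = {2, 3, 1, 2};
  std::vector<int> axis = {0, 3, 1, 2};  // NHWC -> NCHW permutation

  std::vector<int64_t> dst_dim(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    dst_dim[i] = src_dim[axis[i]];  // same rule TransDataLayout applies
  }
  // Prints "2 2 3 1", i.e. the NCHW shape the test asserts.
  for (auto d : dst_dim) printf("%lld ", static_cast<long long>(d));
  printf("\n");
}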
paddle/framework/data_layout_transform.h
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/variable.h"

namespace paddle {
namespace framework {

using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

void TransDataLayout(const std::vector<int>& axis,
                     const platform::DeviceContext* ctx,
                     const KernelTypePair& kernel_pair, const Variable& in,
                     Variable* out);

}  // namespace framework
}  // namespace paddle
paddle/framework/data_transform.cc

@@ -11,22 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <functional>

 #include "paddle/framework/data_transform.h"
-#include "paddle/framework/device_data_transform.h"
+
+#include "paddle/framework/data_device_transform.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {

-DataTransformFnMap& DataTransformFnMap::Instance() {
-  static DataTransformFnMap data_transform_map;
-  return data_transform_map;
-}

 Tensor* DataTransform(const OpKernelType& expected_kernel_type,
                       const OpKernelType& kernel_type_for_var,
                       const Tensor& input_tensor) {
@@ -58,134 +50,5 @@ void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
   }
 }

-auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
-                               DataLayout::kNHWC, LibraryType::kPlain);
-
-auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
-                               DataLayout::kNHWC, LibraryType::kPlain);
-
-auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
-                               DataLayout::kNHWC, LibraryType::kPlain);
-
-auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
-                               DataLayout::kNCHW, LibraryType::kPlain);
-
-// TODO(dzhwinter): Only for testing multiple op kernel.
-// Dummy transform function for library_type
-// should be removed.
-auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
-                                DataLayout::kAnyLayout, LibraryType::kPlain);
-
-auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
-                                DataLayout::kAnyLayout, LibraryType::kCUDNN);
-
-void DummyTrans(const platform::DeviceContext* ctx,
-                const KernelTypePair& kernel_pair, const Variable& in,
-                Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
-  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
-                                      kernel_pair.second.place_),
-      "TransDataType Only Support DataType transform on same place!");
-  auto src = in.Get<Tensor>();
-  auto* dst = out->GetMutable<Tensor>();
-  *dst = src;
-}
-
-void TransDataType(const platform::DeviceContext* ctx,
-                   const KernelTypePair& kernel_pair, const Variable& in,
-                   Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
-  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
-                                      kernel_pair.second.place_),
-      "TransDataType Only Support DataType transform on same place!");
-
-  auto src = in.Get<Tensor>();
-  auto* dst = out->GetMutable<Tensor>();
-
-  auto dims = src.dims();
-  dst->Resize(dims);
-  auto dst_type = kernel_pair.second.data_type_;
-  auto src_type = kernel_pair.first.data_type_;
-
-  switch (src_type) {
-    case proto::DataType::FP32:
-      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
-      break;
-    case proto::DataType::FP64:
-      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
-      break;
-    case proto::DataType::INT32:
-      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
-      break;
-    case proto::DataType::INT64:
-      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
-      break;
-    case proto::DataType::BOOL:
-      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
-      break;
-    default:
-      PADDLE_THROW("Not support type %d", src_type);
-  }
-}
-
-void TransDataLayout(const std::vector<int>& axis,
-                     const platform::DeviceContext* ctx,
-                     const KernelTypePair& kernel_pair, const Variable& in,
-                     Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
-  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
-                                      kernel_pair.second.place_),
-      "TransDataLayout only support DataLayout transform on same place!");
-  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
-                 "TransDataLayout only support Datatype are same!");
-
-  auto src = in.Get<Tensor>();
-  auto* dst = out->GetMutable<Tensor>();
-  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
-
-  auto src_dim = src.dims();
-  std::vector<int64_t> dst_dim;
-
-  dst_dim.resize(axis.size());
-  for (size_t i = 0; i < axis.size(); i++) {
-    dst_dim[i] = src_dim[axis[i]];
-  }
-
-  dst->Resize(make_ddim(dst_dim));
-  auto place = kernel_pair.second.place_;
-  dst->mutable_data(place, src.type());
-
-  auto src_type = kernel_pair.first.data_type_;
-  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
-
-  dst->set_layout(kernel_pair.second.data_layout_);
-}
-
 }  // namespace framework
 }  // namespace paddle
-
-namespace f = paddle::framework;
-
-namespace {
-std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
-std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
-}
-
-REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
-REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans);
-REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans);
-REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
-                           std::bind(f::TransDataLayout, NHWC2NCHW,
-                                     std::placeholders::_1,
-                                     std::placeholders::_2,
-                                     std::placeholders::_3,
-                                     std::placeholders::_4));
-REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
-                           std::bind(f::TransDataLayout, NCHW2NHWC,
-                                     std::placeholders::_1,
-                                     std::placeholders::_2,
-                                     std::placeholders::_3,
-                                     std::placeholders::_4));
paddle/framework/data_transform.h

@@ -30,26 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
-
-using DataTransformFn =
-    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
-                       const Variable&, Variable*)>;
-
-struct KernelTypePairHash {
-  static void HashCombine(const OpKernelType& t, std::size_t* seed) {
-    OpKernelType::Hash kernel_type_hasher;
-    (*seed) ^= kernel_type_hasher(t) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
-  }
-
-  size_t operator()(const KernelTypePair& kernel_pair) const {
-    std::size_t seed = 0;
-    HashCombine(kernel_pair.first, &seed);
-    HashCombine(kernel_pair.second, &seed);
-    return seed;
-  }
-};

 Tensor* DataTransform(const OpKernelType& expected_kernel_type,
                       const OpKernelType& kernel_type_for_var,
                       const Tensor& input_tensor);
@@ -57,125 +37,5 @@ Tensor* DataTransform(const OpKernelType& expected_kernel_type,
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
                             Variable& out_var);

-template <typename InType, typename OutType>
-struct CastDataTypeFunctor {
-  HOSTDEVICE inline OutType operator()(InType in) const {
-    return static_cast<OutType>(in);
-  }
-};
-
-template <typename InType>
-struct CastDataType {
-  CastDataType(const framework::Tensor& in, framework::Tensor* out,
-               const platform::DeviceContext* ctx)
-      : in_(in), out_(out), ctx_(ctx) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-
-  template <typename OutType>
-  void operator()() {
-    auto place = ctx_->GetPlace();
-
-    auto* in_begin = in_.data<InType>();
-    auto numel = in_.numel();
-    auto* in_end = in_begin + numel;
-    auto* out_begin = out_->mutable_data<OutType>(place);
-
-    if (platform::is_cpu_place(place)) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-    } else {
-      // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
-    }
-  }
-};
-
-struct CastDataLayout {
-  CastDataLayout(const platform::DeviceContext* ctx,
-                 const std::vector<int>& axis, const framework::Tensor& in,
-                 framework::Tensor* out)
-      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-  const std::vector<int> axis_;
-
-  template <typename T>
-  void operator()() {
-    auto place = ctx_->GetPlace();
-
-    if (platform::is_cpu_place(place)) {
-      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans4(*context, in_, out_, axis_);
-    } else {
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
-    }
-  }
-};
-
-using DataTransformMap =
-    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
-
-class DataTransformFnMap {
- public:
-  static DataTransformFnMap& Instance();
-
-  bool Has(const KernelTypePair& key_pair) const {
-    return map_.find(key_pair) != map_.end();
-  }
-
-  void Insert(const OpKernelType& left, const OpKernelType& right,
-              const DataTransformFn& data_tranform_fn) {
-    Insert(std::make_pair(left, right), data_tranform_fn);
-  }
-
-  void Insert(const KernelTypePair& kernel_type_pair,
-              const DataTransformFn& data_tranform_fn) {
-    PADDLE_ENFORCE(!Has(kernel_type_pair),
-                   "KernelTypePair %s has been registered", "");
-    map_.insert({kernel_type_pair, data_tranform_fn});
-  }
-
-  const DataTransformFn& Get(const KernelTypePair& key_pair) const {
-    auto data_transformer = GetNullable(key_pair);
-    PADDLE_ENFORCE_NOT_NULL(data_transformer,
-                            "DataTransformFn should not be NULL");
-    return *data_transformer;
-  }
-
-  const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
-    auto it = map_.find(key_pair);
-    if (it == map_.end()) {
-      return nullptr;
-    } else {
-      return &(it->second);
-    }
-  }
-
-  const DataTransformMap& Map() const { return map_; }
-
- private:
-  DataTransformFnMap() = default;
-  DataTransformMap map_;
-  DISABLE_COPY_AND_ASSIGN(DataTransformFnMap);
-};
-
-// generate unique name with __LINE__
-// refs https://stackoverflow.com/questions/1597007
-#define TOKENPASTE(x, y) x##y
-#define TOKENPASTE2(x, y) TOKENPASTE(x, y)
-#define REGISTER_DATA_TRANSFORM_FN(from, to, fn)                              \
-  static int TOKENPASTE2(fn_, __LINE__)() {                                   \
-    ::paddle::framework::DataTransformFnMap::Instance().Insert(from, to, fn); \
-    return 0;                                                                 \
-  }                                                                           \
-  static int TOKENPASTE2(var_, __LINE__) __attribute__((unused)) =            \
-      TOKENPASTE2(fn_, __LINE__)()
-
 }  // namespace framework
 }  // namespace paddle
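The REGISTER_DATA_TRANSFORM_FN macro being deleted here relies on a common C++ static-registration idiom: a file-scope static int is initialized by a function whose only job is to insert into a singleton map, so the insertion runs during static initialization, before main(); __LINE__ token-pasting keeps the generated names unique within a translation unit. A minimal self-contained sketch of the same idiom (the Registry type and REGISTER_FN macro below are hypothetical stand-ins, not the Paddle classes):

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for DataTransformFnMap.
struct Registry {
  static Registry& Instance() {
    static Registry r;  // Meyers singleton, like DataTransformFnMap::Instance()
    return r;
  }
  std::map<std::string, std::function<void()>> fns;
};

#define TOKENPASTE(x, y) x##y
#define TOKENPASTE2(x, y) TOKENPASTE(x, y)
// Each expansion defines fn_<line> plus a static var_<line> whose
// initializer calls it, inserting into the registry before main().
#define REGISTER_FN(name, fn)                 \
  static int TOKENPASTE2(fn_, __LINE__)() {   \
    Registry::Instance().fns[name] = fn;      \
    return 0;                                 \
  }                                           \
  static int TOKENPASTE2(var_, __LINE__) = TOKENPASTE2(fn_, __LINE__)()

REGISTER_FN("hello", [] { std::cout << "hello\n"; });

int main() { Registry::Instance().fns["hello"](); }  // prints "hello"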
paddle/framework/data_transform_test.cc
deleted file mode 100644 (100644 → 0)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <array>
#include <vector>

#include <gtest/gtest.h>

#include "paddle/framework/data_transform.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace framework {
using namespace platform;

/**
 * @brief cross validation of different kernel type transform
 * We use four bit map represent different combination.
 * If the field has multiple possible value, only choose two of them.
 * For DataType, only test the FP32(float), FP64(double).
 * e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
 *      1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
 */

std::array<proto::DataType, 2> kDataType = {
    {proto::DataType::FP32, proto::DataType::FP64}};

std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};

std::array<DataLayout, 2> kDataLayout = {{
    DataLayout::kNHWC, DataLayout::kNCHW,
}};

std::array<LibraryType, 2> kLibraryType = {{
    LibraryType::kPlain, LibraryType::kMKLDNN,
}};

OpKernelType GenFromBit(const std::vector<bool> bits) {
  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
                      kLibraryType[bits[3]]);
}

int test_value = 0;

auto kernel0 = GenFromBit({0, 0, 0, 0});
auto kernel1 = GenFromBit({0, 0, 0, 1});
auto kernel2 = GenFromBit({0, 0, 1, 0});
auto kernel3 = GenFromBit({0, 0, 1, 1});

void TransDataType_t(const platform::DeviceContext* ctx,
                     const KernelTypePair& p, const Variable& in,
                     Variable* out) {
  test_value++;
}

void TransDataLayout_t(const platform::DeviceContext* ctx,
                       const KernelTypePair& p, const Variable& in,
                       Variable* out) {
  test_value--;
}

void TransLibraryType_t(const platform::DeviceContext* ctx,
                        const KernelTypePair& p, const Variable& in,
                        Variable* out) {
  test_value += 2;
}

}  // namespace framework
}  // namespace paddle

namespace frw = paddle::framework;

REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);

TEST(DataTransform, Register) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
  paddle::framework::Variable in;
  paddle::framework::Variable out;

  DeviceContext* ctx = new CPUDeviceContext();
  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
  instance.Get(pair0)(ctx, pair0, in, &out);
  ASSERT_EQ(test_value, 1);

  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
  instance.Get(pair1)(ctx, pair1, in, &out);
  ASSERT_EQ(test_value, 0);

  auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
  instance.Get(pair3)(ctx, pair3, in, &out);
  ASSERT_EQ(test_value, 2);
}

TEST(DataTransform, DataLayout) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
  Variable in;
  Variable out;
  Tensor* src = in.GetMutable<Tensor>();
  src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
  src->set_layout(DataLayout::kNHWC);

  DeviceContext* ctx = new CPUDeviceContext();

  {
    auto kernel1 = GenFromBit({1, 0, 0, 0});
    auto kernel2 = GenFromBit({1, 0, 1, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, in, &out);
  }

  Tensor dst = out.Get<Tensor>();

  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
  EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));

  {
    auto kernel1 = GenFromBit({1, 0, 1, 0});
    auto kernel2 = GenFromBit({1, 0, 0, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, out, &in);
  }

  EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
  EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
}

TEST(DataTransform, DataType) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
  DeviceContext* ctx = new CPUDeviceContext();

  Variable in;
  Variable out;
  Tensor* src = in.GetMutable<Tensor>();
  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
  for (int i = 0; i < 6; ++i) {
    ptr[i] = i / 3;
  }

  {
    auto kernel1 = GenFromBit({0, 0, 0, 0});
    auto kernel2 = GenFromBit({1, 0, 0, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, in, &out);
  }
  Tensor dst = out.Get<Tensor>();
  EXPECT_TRUE(dst.data<double>() != nullptr);
}
paddle/framework/data_type_transform.cc
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/data_type_transform.h"

#include "paddle/framework/selected_rows.h"
#include "paddle/platform/transform.h"

namespace paddle {
namespace framework {

template <typename InType, typename OutType>
struct CastDataTypeFunctor {
  HOSTDEVICE inline OutType operator()(InType in) const {
    return static_cast<OutType>(in);
  }
};

template <typename InType>
struct CastDataType {
  CastDataType(const framework::Tensor& in, framework::Tensor* out,
               const platform::DeviceContext* ctx)
      : in_(in), out_(out), ctx_(ctx) {}
  const framework::Tensor in_;
  framework::Tensor* out_;
  const platform::DeviceContext* ctx_;

  template <typename OutType>
  void operator()() {
    auto place = ctx_->GetPlace();

    auto* in_begin = in_.data<InType>();
    auto numel = in_.numel();
    auto* in_end = in_begin + numel;
    auto* out_begin = out_->mutable_data<OutType>(place);

    if (platform::is_cpu_place(place)) {
      platform::Transform<platform::CPUDeviceContext> trans;
      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
            CastDataTypeFunctor<InType, OutType>());
    } else {
      // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
      PADDLE_THROW("Unsupport CPU <-> GPU!");
    }
  }
};

void TransDataType(const platform::DeviceContext* ctx,
                   const KernelTypePair& kernel_pair, const Variable& in,
                   Variable* out) {
  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
  PADDLE_ENFORCE(
      platform::places_are_same_class(kernel_pair.first.place_,
                                      kernel_pair.second.place_),
      "TransDataType Only Support DataType transform on same place!");

  auto src = in.Get<Tensor>();
  auto* dst = out->GetMutable<Tensor>();

  auto dims = src.dims();
  dst->Resize(dims);
  auto dst_type = kernel_pair.second.data_type_;
  auto src_type = kernel_pair.first.data_type_;

  switch (src_type) {
    case proto::DataType::FP32:
      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
      break;
    case proto::DataType::FP64:
      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
      break;
    case proto::DataType::INT32:
      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
      break;
    case proto::DataType::INT64:
      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
      break;
    case proto::DataType::BOOL:
      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
      break;
    default:
      PADDLE_THROW("Not support type %d", src_type);
  }
}

}  // namespace framework
}  // namespace paddle
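The casting above is a double dispatch: the switch selects InType from the source tensor's runtime type, and VisitDataType then instantiates CastDataType<InType>::operator()<OutType> for the destination type, so every (InType, OutType) pair resolves to the compile-time CastDataTypeFunctor. A stripped-down sketch of that pattern over plain vectors (hypothetical names, not the Paddle visitor machinery):

#include <algorithm>
#include <cstdio>
#include <vector>

template <typename In>
struct Cast {
  const std::vector<In>& in;

  // The second dispatch: Out is chosen per destination type.
  template <typename Out>
  std::vector<Out> to() const {
    std::vector<Out> out(in.size());
    std::transform(in.begin(), in.end(), out.begin(),
                   [](In v) { return static_cast<Out>(v); });  // element-wise static_cast
    return out;
  }
};

int main() {
  std::vector<float> src = {0.f, 0.f, 0.f, 1.f, 1.f, 1.f};  // the i / 3 pattern from the test
  // First dispatch: a runtime "FP32" tag would select Cast<float>;
  // second dispatch: a runtime "FP64" tag would select to<double>().
  auto dst = Cast<float>{src}.to<double>();
  printf("%f\n", dst[3]);  // 1.000000
}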
paddle/framework/data_type_transform.h
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/variable.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace framework {

using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

void TransDataType(const platform::DeviceContext* ctx,
                   const KernelTypePair& kernel_pair, const Variable& in,
                   Variable* out);

}  // namespace framework
}  // namespace paddle
paddle/framework/lod_tensor.cc

@@ -44,9 +44,19 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 }

 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  PADDLE_ENFORCE(platform::is_cpu_place(t.place()));
   PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());

+  if (!platform::is_cpu_place(t.place())) {
+    LoDTensor tt;
+    framework::Copy(t, platform::CPUPlace(), &tt);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(t.place());
+    dev_ctx.Wait();
+
+    os << tt;
+    return os;
+  }
+
   os << "dim: " << t.dims() << "\n";
   os << "lod: " << t.lod() << "\n";
@@ -211,38 +221,23 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }

+// TODO(tonyyang-svail): make this function support LoD
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
-  // PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty())
-  // , "Disable parallel lod for now");
   PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
   PADDLE_ENFORCE(dims()[0] % places.size() == 0,
                  "Batch size should be divided by places size");

   std::vector<LoDTensor> lods;
   for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
-    size_t begin = place_idx * dims()[0] / places.size();
-    size_t end = (place_idx + 1) * dims()[0] / places.size();
-    auto src = Slice(static_cast<int>(begin), static_cast<int>(end));
+    int begin = place_idx * dims()[0] / places.size();
+    int end = (place_idx + 1) * dims()[0] / places.size();

-    LoDTensor dst;
-    dst.Resize(src.dims());
+    auto src = Slice(begin, end);
     auto &dst_place = places[place_idx];
-    auto dst_ptr = dst.mutable_data(dst_place, src.type());
-
-    // TODO(tonyyang-svail):
-    //   change the following to framework::Copy
-    auto src_place = src.place();
-    auto src_ptr = src.data<void>();
-    auto size = src.numel() * SizeOfType(src.type());
-    if (platform::is_cpu_place(src_place) &&
-        platform::is_cpu_place(dst_place)) {
-      memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                   boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-    } else {
-      PADDLE_THROW("Not Implemented");
-    }
+    LoDTensor dst;
+    framework::Copy(src, dst_place, &dst);

     lods.emplace_back(dst);
   }
@@ -250,28 +245,30 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
   return lods;
 }

+// TODO(tonyyang-svail): make this function support LoD
 void LoDTensor::MergeLoDTensor(
-    const std::vector<const LoDTensor *> &lod_tensors, platform::Place place) {
-  PADDLE_ENFORCE(platform::is_cpu_place(place));
+    const std::vector<const LoDTensor *> &lod_tensors,
+    platform::Place dst_place) {
   PADDLE_ENFORCE(!lod_tensors.empty());
+
   framework::DDim new_dim = lod_tensors[0]->dims();
   std::type_index new_type = lod_tensors[0]->type();
+  auto new_layout = lod_tensors[0]->layout();
   for (auto *lod : lod_tensors) {
     PADDLE_ENFORCE(new_dim == lod->dims());
     PADDLE_ENFORCE(new_type == lod->type());
-    PADDLE_ENFORCE(platform::is_cpu_place(lod->place()));
+    PADDLE_ENFORCE(new_layout == lod->layout());
   }
   new_dim[0] *= lod_tensors.size();
   Resize(new_dim);
+  set_layout(new_layout);

-  auto *dst_ptr = reinterpret_cast<uint8_t *>(mutable_data(place, new_type));
+  mutable_data(dst_place, new_type);
+  int begin = 0;
   for (auto *src : lod_tensors) {
-    auto size = src->numel() * SizeOfType(src->type());
-    memory::Copy(boost::get<platform::CPUPlace>(place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src->place()),
-                 src->data<void>(), size);
-    dst_ptr += size;
+    int end = begin + src->dims()[0];
+    auto dst = Slice(begin, end);
+    framework::Copy(*src, dst_place, &dst);
+    begin = end;
   }
 }
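The split arithmetic is worth spelling out: a batch of dims()[0] rows is cut into places.size() equal row ranges, which is why the batch size must divide evenly (the PADDLE_ENFORCE above), and MergeLoDTensor reverses the slicing with the same integer arithmetic. A small standalone check of the begin/end computation (illustration only):

#include <cassert>
#include <cstdio>

int main() {
  const int batch = 4;     // dims()[0]
  const int n_places = 2;  // places.size()
  assert(batch % n_places == 0);  // "Batch size should be divided by places size"

  for (int place_idx = 0; place_idx < n_places; ++place_idx) {
    int begin = place_idx * batch / n_places;
    int end = (place_idx + 1) * batch / n_places;
    // Slice(begin, end): rows [0,2) go to places[0], rows [2,4) to places[1].
    printf("place %d gets rows [%d, %d)\n", place_idx, begin, end);
  }
}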
paddle/framework/operator.cc

@@ -16,7 +16,6 @@ limitations under the License. */
 #include <algorithm>

 #include "paddle/framework/data_transform.h"
-#include "paddle/framework/device_data_transform.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/shape_inference.h"
paddle/framework/tensor_util.h

@@ -31,9 +31,10 @@ namespace framework {
  *
  * @note Copy supports CPU <-> GPU, GPU <-> GPU.
  */
 inline void Copy(const Tensor& src, const platform::Place& dst_place,
                  const platform::DeviceContext& ctx, Tensor* dst) {
+  VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
   src.check_memory_size();
   dst->Resize(src.dims());
@@ -88,26 +89,25 @@ inline void Copy(const Tensor& src, const platform::Place& dst_place,
 }

 /**
- * @brief Copy supports CPU <-> CPU
+ * @brief Wrapper on
+ *     Copy(const Tensor& src, const platform::Place& dst_place,
+ *          const platform::DeviceContext& ctx, Tensor* dst);
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] dst_place  The dst place.
+ *
+ * @note Copy supports CPU <-> GPU, GPU <-> GPU.
  */
 inline void Copy(const Tensor& src, const platform::Place& dst_place,
                  Tensor* dst) {
-  src.check_memory_size();
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  PADDLE_ENFORCE(platform::is_cpu_place(src_place) &&
-                 platform::is_cpu_place(dst_place));
-
-  memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-               boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  const platform::DeviceContext* dev_ctx;
+  if (platform::is_gpu_place(src.place())) {
+    dev_ctx = pool.Get(src.place());
+  } else {
+    dev_ctx = pool.Get(dst_place);
+  }
+  Copy(src, dst_place, *dev_ctx, dst);
 }
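After this change the two-argument Copy is a thin wrapper: it no longer restricts itself to CPU-to-CPU, but picks a device context from the global pool (the source place's context for GPU sources, otherwise the destination's) and forwards to the three-argument overload. A hedged usage sketch, assuming a built Paddle tree at this revision:

// Illustrative only; mirrors the call path the wrapper sets up.
#include "paddle/framework/tensor_util.h"

void copy_to_cpu_example(const paddle::framework::Tensor& src,
                         paddle::framework::Tensor* dst) {
  // No DeviceContext argument needed: the wrapper fetches one from
  // DeviceContextPool, so callers that don't carry a context around
  // (e.g. SplitLoDTensor above) can still copy across places.
  paddle::framework::Copy(src, paddle::platform::CPUPlace(), dst);
}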
paddle/framework/var_desc.cc

@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
+      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
   }
 }
paddle/gserver/layers/MKLDNNLayer.cpp

@@ -132,6 +132,8 @@ void MKLDNNLayer::reshapeInput(int& batchsize,
   if (w != 0) {
     width = w;
   }
+  height = height != 0 ? height : 1;
+  width = width != 0 ? width : 1;
 }

 void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
paddle/gserver/layers/MKLDNNLayer.h

@@ -98,6 +98,8 @@ protected:
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
         ih_(0),
         iw_(0),
+        condition_(0),
         needResetBwd_(true),
         outputOnlyMKLDNN_(false),
paddle/operators/edit_distance_op.cc
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/edit_distance_op.h"

namespace paddle {
namespace operators {

class EditDistanceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
    auto hyp_dims = ctx->GetInputDim("Hyps");
    auto ref_dims = ctx->GetInputDim("Refs");
    PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1,
                   "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension "
                   "equal to 1.");
    PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1,
                   "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension "
                   "equal to 1.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("Refs"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(framework::proto::DataType::FP32,
                                   ctx.device_context());
  }
};

class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Hyps",
             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
             "The indices for hypothesis strings.");
    AddInput("Refs",
             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
             "The indices for reference strings.");
    AddAttr<bool>("normalized",
                  "(bool, default false) Indicated whether to normalize "
                  "the edit distance by the length of reference string.")
        .SetDefault(false);
    AddOutput("Out",
              "(2-D Tensor with shape [`batch_size` x 1]) "
              "The output edit distances of EditDistance operator.");
    AddComment(R"DOC(

EditDistance operator computes the edit distances between a batch of hypothesis
strings and their references.

Edit distance, also called Levenshtein distance, measures how dissimilar two strings
are by counting the minimum number of operations to transform one string into anthor.
Here the operations include insertion, deletion, and substitution. For example,
given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
is 3 for A will be transformed into B at least after two substitutions and one
insertion:

   "kitten" -> "sitten" -> "sittin" -> "sitting"

Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total
number denoted by `batch_size`, and the separation is specified by the LoD information.
And the `batch_size` reference strings are arranged in order in the same way in the
LoDTensor Input(Refs).

Output(Out) contains the `batch_size` results and each stands for the edit stance
for a pair of strings respectively. If Attr(normalized) is true, the edit distance
will be divided by the length of reference string.

)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
    edit_distance, ops::EditDistanceKernel<paddle::platform::CPUPlace, float>);
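The DOC example can be checked with the same dynamic program the CPU kernel (edit_distance_op.h below) implements: dist is an (m+1) x (n+1) table where dist[i][j] is the distance between the first i characters of the hypothesis and the first j of the reference. A minimal standalone version over plain std::string instead of index LoDTensors (illustration only):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int EditDistance(const std::string& hyp, const std::string& ref) {
  const size_t m = hyp.size(), n = ref.size();
  std::vector<std::vector<int>> dist(m + 1, std::vector<int>(n + 1));
  for (size_t i = 0; i <= m; ++i) dist[i][0] = static_cast<int>(i);  // i deletions
  for (size_t j = 0; j <= n; ++j) dist[0][j] = static_cast<int>(j);  // j insertions
  for (size_t i = 1; i <= m; ++i) {
    for (size_t j = 1; j <= n; ++j) {
      int cost = hyp[i - 1] == ref[j - 1] ? 0 : 1;
      dist[i][j] = std::min({dist[i - 1][j] + 1,          // deletion
                             dist[i][j - 1] + 1,          // insertion
                             dist[i - 1][j - 1] + cost}); // substitution
    }
  }
  return dist[m][n];
}

int main() {
  // Matches the DOC comment: "kitten" -> "sitten" -> "sittin" -> "sitting".
  printf("%d\n", EditDistance("kitten", "sitting"));  // 3
}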
paddle/operators/edit_distance_op.cu
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/gpu_info.h"

namespace paddle {
namespace operators {

using platform::PADDLE_CUDA_NUM_THREADS;

template <typename T>
__global__ void FillFirstRow(T* dist, const int N) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < N + 1) {
    dist[idx] = idx;
  }
}

template <typename T>
__global__ void FillFirstColumn(T* dist, const int M, const int N) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < M + 1) {
    dist[idx * (N + 1)] = idx;
  }
}

template <typename T>
__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M,
                            const int N, const int start) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int offset = N;
  int index = start + idx * offset;
  int row = index / (N + 1);
  int col = index % (N + 1);
  if (row > 0 && col > 0 && row < M + 1 && col < N + 1) {
    int cost = x1[row - 1] == x2[col - 1] ? 0 : 1;
    int dels = dist[(row - 1) * (N + 1) + col] + 1;
    int ins = dist[row * (N + 1) + col - 1] + 1;
    int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost;
    dist[index] = min(dels, min(ins, subs));
  }
}

template <typename T>
__global__ void SetOutput(T* out, const T* dist, const int M, const int N,
                          bool normalized) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx == 0) {
    out[0] = normalized ? dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N];
  }
}

template <typename Place, typename T>
class EditDistanceGPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out_t = ctx.Output<framework::Tensor>("Out");

    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");

    auto normalized = ctx.Attr<bool>("normalized");
    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                      ctx.device_context())
                      .stream();

    auto hyp_lod = x1_t->lod()[0];
    auto ref_lod = x2_t->lod()[0];
    PADDLE_ENFORCE(
        hyp_lod.size() == ref_lod.size(),
        "Input(Hyps) and Input(Refs) must have the same batch size.");
    for (size_t i = 1; i < ref_lod.size(); ++i) {
      PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
                     "Reference string %d is empty.", i);
    }

    auto num_strs = hyp_lod.size() - 1;
    out_t->Resize({static_cast<int64_t>(num_strs), 1});
    out_t->mutable_data<T>(ctx.GetPlace());
    auto out = out_t->data<T>();

    T distance = 0.0;
    for (size_t num = 0; num < num_strs; num++) {
      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);
      if (m == 0 || n == 0) {
        distance = std::max(m, n);
        if (normalized) {
          PADDLE_ENFORCE(n > 0,
                         "The reference string (#%d) cannot be empty "
                         "when Attr(normalized) is enabled.",
                         n);
          distance = distance / n;
        }
        memory::Copy(boost::get<Place>(ctx.GetPlace()), out + num,
                     platform::CPUPlace(), &distance, sizeof(T), stream);
      } else {
        framework::Tensor dist_t;
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
        auto x1 = x1_t->data<int>() + hyp_lod[num];
        auto x2 = x2_t->data<int>() + ref_lod[num];

        FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
                             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);

        FillFirstRow<T><<<1 + n / PADDLE_CUDA_NUM_THREADS,
                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n);
        // Compute the elements of distance matrix in the anti-diagonal diretion
        for (int64_t slice = 2; slice < m + n + 1; ++slice) {
          int z_m = slice < m + 1 ? 0 : slice - m;
          int z_n = slice < n + 1 ? 0 : slice - n;
          int size = slice - (z_m + z_n) + 1;  // number of elments in the same
                                               // anti-diagonal line to update
          // the start index at which computes from
          int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1;
          Levenshtein<T><<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS,
                           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2,
                                                                 m, n, start);
        }
        SetOutput<T><<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    edit_distance,
    ops::EditDistanceGPUKernel<paddle::platform::CUDAPlace, float>);
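The CUDA kernel parallelizes the DP by anti-diagonals: every cell (row, col) with row + col == slice depends only on the two previous diagonals, so all cells on one diagonal can be updated concurrently, and stepping along a diagonal in the flattened (N+1)-wide table adds exactly N to the index, which is what `start + idx * offset` walks. A host-side sketch of that iteration order (illustration only, no CUDA; boundary cells with row or col == 0 are listed but the kernel skips them since FillFirstRow/FillFirstColumn prefill them):

#include <cstdio>

int main() {
  const int M = 3, N = 4;  // hypothesis length, reference length
  for (int slice = 2; slice < M + N + 1; ++slice) {
    int z_m = slice < M + 1 ? 0 : slice - M;
    int z_n = slice < N + 1 ? 0 : slice - N;
    int size = slice - (z_m + z_n) + 1;  // cells on this anti-diagonal
    int start = slice < N + 1 ? slice : (z_n + 1) * (N + 1) - 1;
    printf("slice %d:", slice);
    for (int idx = 0; idx < size; ++idx) {
      int index = start + idx * N;  // next cell: one row down, one col left
      // These (row, col) pairs all satisfy row + col == slice and are
      // independent of each other, hence one GPU launch per slice.
      printf(" (%d,%d)", index / (N + 1), index % (N + 1));
    }
    printf("\n");
  }
}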
paddle/operators/edit_distance_op.h
new file mode 100644

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename Place, typename T>
class EditDistanceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out_t = ctx.Output<framework::Tensor>("Out");

    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");

    auto normalized = ctx.Attr<bool>("normalized");

    auto hyp_lod = x1_t->lod()[0];
    auto ref_lod = x2_t->lod()[0];
    PADDLE_ENFORCE(
        hyp_lod.size() == ref_lod.size(),
        "Input(Hyps) and Input(Refs) must have the same batch size.");
    for (size_t i = 1; i < ref_lod.size(); ++i) {
      PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
                     "Reference string %d is empty.", i);
    }
    auto num_strs = hyp_lod.size() - 1;

    out_t->Resize({static_cast<int64_t>(num_strs), 1});
    out_t->mutable_data<float>(ctx.GetPlace());
    auto out = out_t->data<T>();

    T distance = 0.0;
    for (size_t num = 0; num < num_strs; ++num) {
      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);

      if (m == 0) {
        distance = n;
      } else if (n == 0) {
        distance = m;
      } else {
        framework::Tensor dist_t;
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
        auto x1 = x1_t->data<int>() + hyp_lod[num];
        auto x2 = x2_t->data<int>() + ref_lod[num];
        for (int64_t i = 0; i < m + 1; ++i) {
          dist[i * (n + 1)] = i;
        }
        for (int64_t j = 0; j < n + 1; ++j) {
          dist[j] = j;
        }
        for (int64_t i = 1; i < m + 1; ++i) {
          for (int64_t j = 1; j < n + 1; ++j) {
            int cost = x1[i - 1] == x2[j - 1] ? 0 : 1;
            int dels = dist[(i - 1) * (n + 1) + j] + 1;
            int ins = dist[i * (n + 1) + (j - 1)] + 1;
            int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost;
            dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs));
          }
        }
        distance = dist[m * (n + 1) + n];
      }

      if (normalized) {
        PADDLE_ENFORCE(n > 0,
                       "The reference string (#%d) cannot be empty "
                       "when Attr(normalized) is enabled.",
                       n);
        distance = distance / n;
      }
      out[num] = distance;
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/operators/get_places_op.cc

@@ -39,17 +39,19 @@ class GetPlacesOp : public framework::OperatorBase {
       : OperatorBase(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope& scope,
            const platform::Place& place) const override {
-    std::string device_type = Attr<std::string>("device_type");
+    bool is_gpu;
+    if (Attr<std::string>("device_type") == "AUTO") {
+      is_gpu = platform::is_gpu_place(place);
+    } else {
+      is_gpu = Attr<std::string>("device_type") == "CUDA";
+    }
     auto device_count = static_cast<size_t>(Attr<int>("device_count"));
     if (device_count == 0) {
-      if (device_type == "CUDA") {
-        device_count = CUDADevCount();
-      } else if (device_type == "CPU") {
-        device_count = std::thread::hardware_concurrency();
-      }
+      device_count =
+          is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
     }
-    PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count",
-                      device_type);
+    PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count",
+                      is_gpu ? "GPU" : "CPU");

     auto out_var_name = Output("Out");
     auto& places =
@@ -57,14 +59,14 @@ class GetPlacesOp : public framework::OperatorBase {
                            "Output variable %s cannot be found", out_var_name)
               .GetMutable<platform::PlaceList>());
     places.reserve(device_count);
-    if (device_type == "CUDA") {
+    if (is_gpu) {
       PADDLE_ENFORCE_LE(device_count, CUDADevCount(),
                         "Only %d CUDA devices found, cannot set to %d",
                         CUDADevCount(), device_count);
       for (size_t i = 0; i < device_count; ++i) {
-        places.emplace_back(platform::CUDAPlace(i));
+        places.emplace_back(platform::CUDAPlace(static_cast<int>(i)));
       }
-    } else if (device_type == "CPU") {
+    } else {
       for (size_t i = 0; i < device_count; ++i) {
         places.emplace_back(platform::CPUPlace());
       }
@@ -77,10 +79,10 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   GetPlacesOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Out", "vector of Place");
-    AddAttr<int>("device_count", "device count").SetDefault(1);
-    AddAttr<std::string>("device_type",
-                         R"(device type must be in ["CPU", "CUDA"])")
-        .InEnum({"CPU", "CUDA"});
+    AddAttr<int>("device_count", "device count").SetDefault(0);
+    AddAttr<std::string>("device_type", "device type")
+        .InEnum({"CUDA", "CPU", "AUTO"})
+        .SetDefault("AUTO");
     AddComment(R"DOC(
 Returns a list of places based on flags. The list will be used for parallel
 execution.
@@ -111,4 +113,5 @@ class GetPlacesInferShape : public framework::InferShapeBase {
 namespace ops = paddle::operators;

 REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker,
-                  ops::GetPlacesInferVarType, ops::GetPlacesInferShape);
+                  ops::GetPlacesInferVarType, ops::GetPlacesInferShape,
+                  paddle::framework::EmptyGradOpMaker);
paddle/operators/parallel_do_op.cc
浏览文件 @
9eb3fb29
...
...
@@ -39,6 +39,7 @@ void SplitTensorAndMoveTensorToScopes(
const
std
::
vector
<
framework
::
Scope
*>
&
sub_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
std
::
string
>
&
names
)
{
PADDLE_ENFORCE_EQ
(
sub_scopes
.
size
(),
places
.
size
());
for
(
auto
&
argu
:
names
)
{
auto
*
var
=
scope
.
FindVar
(
argu
);
const
auto
&
tensor
=
var
->
Get
<
LoDTensor
>
();
...
...
@@ -54,6 +55,15 @@ void SplitTensorAndMoveTensorToScopes(
}
}
void
WaitOnPlaces
(
const
std
::
vector
<
platform
::
Place
>
places
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
for
(
auto
&
place
:
places
)
{
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
dev_ctx
.
Wait
();
}
}
class
ParallelDoOp
:
public
framework
::
OperatorBase
{
public:
ParallelDoOp
(
const
std
::
string
&
type
,
...
...
@@ -71,10 +81,7 @@ class ParallelDoOp : public framework::OperatorBase {
auto
*
block
=
Attr
<
framework
::
BlockDesc
*>
(
kParallelBlock
);
auto
*
program
=
block
->
Program
();
// TODO(tonyyang-svail): get places from input
std
::
vector
<
platform
::
Place
>
places
;
places
.
emplace_back
(
platform
::
CPUPlace
());
places
.
emplace_back
(
platform
::
CPUPlace
());
auto
&
places
=
scope
.
FindVar
(
Input
(
kPlaces
))
->
Get
<
platform
::
PlaceList
>
();
auto
&
sub_scopes
=
*
scope
.
FindVar
(
Output
(
kParallelScopes
))
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
...
...
@@ -82,8 +89,22 @@ class ParallelDoOp : public framework::OperatorBase {
sub_scopes
.
push_back
(
&
scope
.
NewScope
());
}
// split input
SplitTensorAndMoveTensorToScopes
(
scope
,
sub_scopes
,
places
,
Inputs
(
kInputs
));
// copy parameter
for
(
auto
&
param
:
Inputs
(
kParameters
))
{
PADDLE_ENFORCE
(
scope
.
FindVar
(
param
)
->
IsType
<
LoDTensor
>
(),
"Only support parameter type as LoDTensor"
);
auto
&
src
=
scope
.
FindVar
(
param
)
->
Get
<
LoDTensor
>
();
for
(
size_t
i
=
0
;
i
<
places
.
size
();
++
i
)
{
auto
&
place
=
places
[
i
];
auto
*
sub_scope
=
sub_scopes
[
i
];
auto
*
dst
=
sub_scope
->
Var
(
param
)
->
GetMutable
<
LoDTensor
>
();
framework
::
Copy
(
src
,
place
,
dst
);
}
}
WaitOnPlaces
(
places
);
std
::
vector
<
std
::
future
<
void
>>
workers
;
workers
.
reserve
(
places
.
size
());
...
...
@@ -93,12 +114,6 @@ class ParallelDoOp : public framework::OperatorBase {
auto
&
place
=
places
[
place_idx
];
auto
*
cur_scope
=
sub_scopes
[
place_idx
];
// copy parameter
// some version of boost lacks != for boost::variant
if
(
!
(
dev_ctx
.
GetPlace
()
==
place
))
{
PADDLE_THROW
(
"Not Implemented"
);
}
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
...
...
@@ -108,6 +123,7 @@ class ParallelDoOp : public framework::OperatorBase {
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
}
WaitOnPlaces
(
places
);
// merge output
for
(
auto
&
o_name
:
Outputs
(
kOutputs
))
{
...
...
@@ -121,6 +137,7 @@ class ParallelDoOp : public framework::OperatorBase {
scope
.
FindVar
(
o_name
)
->
GetMutable
<
LoDTensor
>
();
lod_tensor_to_be_merged
->
MergeLoDTensor
(
lod_tensors
,
dev_ctx
.
GetPlace
());
}
WaitOnPlaces
(
places
);
}
};
...
...
@@ -161,15 +178,14 @@ class ParallelDoGradOp : public OperatorBase {
    auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
                           ->Get<std::vector<framework::Scope *>>();

-    // TODO(tonyyang-svail): get places from input
-    std::vector<platform::Place> places;
-    places.emplace_back(platform::CPUPlace());
-    places.emplace_back(platform::CPUPlace());
+    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();

    // feed output@grad
    SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places,
                                     Inputs(framework::GradVarName(kOutputs)));
+    WaitOnPlaces(places);

    // for debugging
    for (auto &s : Inputs(framework::GradVarName(kOutputs))) {
      VLOG(3) << s;
      VLOG(3) << scope.FindVar(s)->Get<LoDTensor>();
...
...
@@ -196,10 +212,11 @@ class ParallelDoGradOp : public OperatorBase {
    for (auto &worker : workers) {
      worker.wait();
    }
+    WaitOnPlaces(places);

    // merge grad
    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      VLOG(3) << s;
+      VLOG(3) << "merge grad " << s;

      auto &t = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
      VLOG(3) << t;
...
...
@@ -216,7 +233,8 @@ class ParallelDoGradOp : public OperatorBase {
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, s_buf}}}, {{"Out", {s}}},
            framework::AttributeMap{});
-        sum_op->Run(*sub_scopes[0], place);
+        sum_op->Run(*sub_scopes[0], places[0]);
+        WaitOnPlaces(places);
      }

      VLOG(3) << t;
...
...
@@ -236,8 +254,10 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
    for (auto &input_param : this->InputNames()) {
      VLOG(3) << input_param;
      grad->SetInput(input_param, this->Input(input_param));
-      grad->SetOutput(framework::GradVarName(input_param),
-                      this->InputGrad(input_param, false));
+      if (input_param != kPlaces) {
+        grad->SetOutput(framework::GradVarName(input_param),
+                        this->InputGrad(input_param, false));
+      }
    }

    for (auto &output_param : this->OutputNames()) {
...
...
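On the gradient side, each sub-scope ends up with a partial parameter gradient, and the loop above folds them into sub_scopes[0] by running a sum op once per extra place. A rough NumPy equivalent of that accumulation (illustrative only):

import numpy as np

# One partial gradient per place, as produced by the per-place backward runs.
partial_grads = [np.array([0.1, 0.2]), np.array([0.3, 0.4])]

# Fold everything into slot 0, as the repeated sum_op->Run(...) does.
merged = partial_grads[0]
for g in partial_grads[1:]:
    merged = merged + g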
paddle/operators/shrink_rnn_memory_op.cc
View file @ 9eb3fb29
...
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
#include "paddle/operators/array_operator.h"
#include "paddle/operators/math/math_function.h"
...
...
@@ -46,8 +47,21 @@ class ShrinkRNNMemoryOp : public ArrayOp {
    auto *out_var = scope.FindVar(Output("Out"));
    PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set");
    auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();

+    size_t height = dst_num_rows;
+
+    // do shrink for the top level LoD
+    if (x_tensor.lod().size() > 0 &&
+        x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
+      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(),
+                                                              0, dst_num_rows, 0);
+      height = lod_offset.second.second;
+      auto out_lod = out_tensor.mutable_lod();
+      framework::AppendLoD(out_lod, lod_offset.first);
+    }

    if (dst_num_rows != 0) {
-      out_tensor.ShareDataWith(x_tensor.Slice(0, dst_num_rows));
+      out_tensor.ShareDataWith(x_tensor.Slice(0, height));
    }
  }
};
...
...
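The new branch shrinks along the top level of the LoD instead of by raw rows: it keeps the first dst_num_rows top-level sequences and takes the row count (height) from the absolute end offset of the last kept sequence. A small Python sketch of that computation (shrink_height is a hypothetical helper, not the C++ signature):

def shrink_height(lod, dst_num_rows):
    # lod[0] holds top-level offsets, e.g. [0, 2, 5, 6] for three sequences.
    if len(lod) > 0 and len(lod[0]) > dst_num_rows:
        # The first dst_num_rows sequences end at this absolute row offset.
        return lod[0][dst_num_rows]
    return dst_num_rows

assert shrink_height([[0, 2, 5, 6]], 2) == 5  # two sequences cover rows 0..4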
@@ -64,11 +78,11 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
    AddComment(R"DOC(
In dynamic RNN, we are able to handle sequences of different lengths.
Because of the multiple lengths, the size of each step input can be
different, which may lead to a mismatching between the input of
the current step and the memory generated by the previous one. This
operator shrinks memory according to the size of the next step input,
to make sure that they can match each other.
)DOC");
  }
...
...
@@ -132,6 +146,7 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
    PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
    context->SetOutputDim(framework::GradVarName("X"),
                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
  }
};
...
...
paddle/scripts/submit_local.sh.in
View file @ 9eb3fb29
...
...
@@ -92,6 +92,9 @@ function threads_config() {
        if [ -z "$OPENBLAS_NUM_THREADS" ]; then
            export OPENBLAS_NUM_THREADS=$threads
        fi
+        if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then
+            export OPENBLAS_MAIN_FREE=1
+        fi
    fi
}
...
...
python/paddle/v2/__init__.py
View file @ 9eb3fb29
...
...
@@ -62,12 +62,15 @@ __all__ = [
cp.begin_parse()


-def set_omp_mkl_env_vars(trainer_count):
+def set_env_vars(trainer_count):
    '''Auto set CPU environment if have not set before.
-       export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
-       export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
+       For MKL:
+         export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
+         export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
+       For OpenBLAS:
+         export OPENBLAS_NUM_THREADS, OPENBLAS_MAIN_FREE according to trainer_count.
    '''
-    import platform
+    import platform, paddle
    if not platform.system() in ['Linux', 'Darwin']:
        return
...
...
@@ -103,16 +106,22 @@ def set_omp_mkl_env_vars(trainer_count):
    num_cores = num_physical_cores()
    num_processors = num_logical_processors()
-    if num_processors > num_cores:  # Hyper Threading is enabled
-        set_env("OMP_DYNAMIC", "true")
-        set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
-    else:
-        set_env("OMP_DYNAMIC", "false")
-        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    if paddle.version.mkl() == 'ON':
+        if num_processors > num_cores:  # Hyper Threading is enabled
+            set_env("OMP_DYNAMIC", "true")
+            set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
+        else:
+            set_env("OMP_DYNAMIC", "false")
+            set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
    threads = num_processors / trainer_count
    threads = '1' if threads < 1 else str(threads)
-    set_env("OMP_NUM_THREADS", threads)
-    set_env("MKL_NUM_THREADS", threads)
+    if paddle.version.mkl() == 'ON':
+        set_env("OMP_NUM_THREADS", threads)
+        set_env("MKL_NUM_THREADS", threads)
+    else:
+        set_env("OPENBLAS_NUM_THREADS", threads)
+        if threads > 1:
+            set_env("OPENBLAS_MAIN_FREE", '1')


def init(**kwargs):
...
...
@@ -129,7 +138,7 @@ def init(**kwargs):
    for key in args_dict.keys():
        args.append('--%s=%s' % (key, str(args_dict[key])))

-    set_omp_mkl_env_vars(kwargs.get('trainer_count', 1))
+    set_env_vars(kwargs.get('trainer_count', 1))

    if 'use_gpu' in kwargs:
        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
...
...
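In practice, the same paddle.init call now configures whichever BLAS backend the wheel was built with. For example, assuming an OpenBLAS build on a machine with 4 logical processors:

import paddle.v2 as paddle

# set_env_vars computes threads = 4 / 2 = 2; on an OpenBLAS build it exports
# OPENBLAS_NUM_THREADS=2 and OPENBLAS_MAIN_FREE=1 (threads > 1), while an MKL
# build would export OMP_NUM_THREADS=2 and MKL_NUM_THREADS=2 instead.
paddle.init(use_gpu=False, trainer_count=2)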
python/paddle/v2/fluid/layers/device.py
View file @ 9eb3fb29
...
...
@@ -4,19 +4,22 @@ All util layers.
from ..layer_helper import LayerHelper
from ..framework import unique_name
+from ..registry import autodoc

__all__ = ['get_places']


-def get_places(device_count=0, device_type="CPU"):
+@autodoc
+def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())
    out_places = helper.create_variable(name=unique_name(helper.name + ".out"))
+    attrs = dict()
+    if device_count is not None:
+        attrs['device_count'] = int(device_count)
+    if device_type is not None:
+        attrs['device_type'] = str(device_type)
+
    helper.append_op(
-        type='get_places',
-        outputs={"Out": [out_places]},
-        attrs={
-            "device_type": device_type,
-            'device_count': device_count,
-        })
+        type='get_places', outputs={"Out": [out_places]}, attrs=attrs)

    return out_places
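With the attrs now optional, callers may omit either argument and let the operator fall back to its own defaults. A typical call, matching the test update below:

import paddle.v2.fluid.layers as layers

# Only device_count is set; device_type is omitted, so no attr is attached.
places = layers.get_places(device_count=4)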
python/paddle/v2/fluid/tests/test_edit_distance_op.py
0 → 100644
View file @ 9eb3fb29
import unittest
import numpy as np
from op_test import OpTest


def Levenshtein(hyp, ref):
    """ Compute the Levenshtein distance between two strings.

    :param hyp: hypothesis string in index
    :type hyp: list
    :param ref: reference string in index
    :type ref: list
    """
    m = len(hyp)
    n = len(ref)
    if m == 0:
        return n
    if n == 0:
        return m

    dist = np.zeros((m + 1, n + 1)).astype("float32")
    for i in range(0, m + 1):
        dist[i][0] = i
    for j in range(0, n + 1):
        dist[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
            deletion = dist[i - 1][j] + 1
            insertion = dist[i][j - 1] + 1
            substitution = dist[i - 1][j - 1] + cost
            dist[i][j] = min(deletion, insertion, substitution)

    return dist[m][n]


class TestEditDistanceOp(OpTest):
    def setUp(self):
        self.op_type = "edit_distance"
        normalized = False
        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int32")
        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int32")
        x1 = np.transpose(x1)
        x2 = np.transpose(x2)
        x1_lod = [0, 1, 5]
        x2_lod = [0, 3, 4]

        num_strs = len(x1_lod) - 1
        distance = np.zeros((num_strs, 1)).astype("float32")
        for i in range(0, num_strs):
            distance[i] = Levenshtein(
                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
                ref=x2[x2_lod[i]:x2_lod[i + 1]])
            if normalized is True:
                len_ref = x2_lod[i + 1] - x2_lod[i]
                distance[i] = distance[i] / len_ref

        self.attrs = {'normalized': normalized}
        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
        self.outputs = {'Out': distance}

    def test_check_output(self):
        self.check_output()


class TestEditDistanceOpNormalized(OpTest):
    def setUp(self):
        self.op_type = "edit_distance"
        normalized = True
        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int32")
        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int32")
        x1 = np.transpose(x1)
        x2 = np.transpose(x2)
        x1_lod = [0, 1, 3, 6]
        x2_lod = [0, 2, 3, 5]

        num_strs = len(x1_lod) - 1
        distance = np.zeros((num_strs, 1)).astype("float32")
        for i in range(0, num_strs):
            distance[i] = Levenshtein(
                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
                ref=x2[x2_lod[i]:x2_lod[i + 1]])
            if normalized is True:
                len_ref = x2_lod[i + 1] - x2_lod[i]
                distance[i] = distance[i] / len_ref

        self.attrs = {'normalized': normalized}
        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
        self.outputs = {'Out': distance}

    def test_check_output(self):
        self.check_output()


if __name__ == '__main__':
    unittest.main()
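As a quick sanity check of the reference Levenshtein helper above: turning [1, 2, 3] into [1, 3] takes a single deletion, so the distance is 1.

# One deletion (the token 2) separates the two index strings.
assert Levenshtein(hyp=[1, 2, 3], ref=[1, 3]) == 1.0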
python/paddle/v2/fluid/tests/test_parallel_op.py
View file @ 9eb3fb29
...
...
@@ -18,7 +18,7 @@ class ParallelOpTest(unittest.TestCase):
            append_batch_size=False,
            stop_gradient=False)

-        places = fluid.default_main_program().global_block().create_var()
+        places = layers.get_places(device_count=4)
        pd = layers.ParallelDo(places=places)

        with pd.do():
...
...
python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
View file @ 9eb3fb29
...
...
@@ -3,43 +3,86 @@ import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward
-from paddle.v2.fluid.framework import default_main_program
-import numpy
+from paddle.v2.fluid.framework import default_main_program, switch_main_program
+from paddle.v2.fluid.framework import Program
+import numpy as np

-main_program = default_main_program()


-class TestShrinkRNNMemory(unittest.TestCase):
-    def test_shrink_rnn_memory(self):
+class TestShrinkRNNMemoryBase(unittest.TestCase):
+    def setUp(self):
+        self.main_program = Program()
+        switch_main_program(self.main_program)
        x = layers.data('x', shape=[100], dtype='float32')
        x.stop_gradient = False
-        table = layers.lod_rank_table(x=x)
+        rank_table_tensor = layers.data(
+            'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
+        table = layers.lod_rank_table(x=rank_table_tensor)
        i = layers.zeros(dtype='int64', shape=[1])
-        mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        self.mem1 = layers.shrink_memory(x=x, i=i, table=table)
        i = layers.increment(x=i)
        i.stop_gradient = True
-        mem2 = layers.shrink_memory(x=mem1, i=i, table=table)
+        self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table)
        i = layers.increment(x=i)
        i.stop_gradient = True
-        mem3 = layers.shrink_memory(x=mem2, i=i, table=table)
+        self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table)
+        mem3_mean = layers.mean(x=self.mem3)
+        append_backward(loss=mem3_mean)
+        self.x_grad = self.main_program.global_block().var('x@GRAD')
+
+    def sum_lodtensor(self, tensor):
+        sum_res = 0.0
+        for i in xrange(np.product(tensor.get_dims())):
+            sum_res += tensor.get_float_element(i)
+        return sum_res
+
+
+class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
+    def test_refer_lod(self):
        cpu = core.CPUPlace()
-        tensor = core.LoDTensor()
-        tensor.set_lod([[0, 2, 5, 6]])
-        tensor_np = numpy.random.random(size=(3, 100)).astype('float32')
-        tensor.set(tensor_np, cpu)
+        x_tensor = core.LoDTensor()
+        x_tensor.set_lod([[0, 2, 5, 6]])
+        tensor_np = np.random.random(size=(6, 100)).astype('float32')
+        x_tensor.set(tensor_np, cpu)
+
+        rank_table_tensor = core.LoDTensor()
+        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set(
+            np.random.random(size=(6, 1)).astype('float32'), cpu)
+
        exe = Executor(cpu)
-        outs = exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3])
-        self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0]))
-        self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1]))
-        self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
+        outs = exe.run(
+            feed={'x': x_tensor,
+                  'rank_table_tensor': rank_table_tensor},
+            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
+            return_numpy=False)
+        self.assertTrue(np.allclose(tensor_np[0:6], outs[0]))
+        self.assertTrue(np.allclose(tensor_np[0:5], outs[1]))
+        self.assertTrue(np.allclose(tensor_np[0:2], outs[2]))
+        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)

-        mem3_mean = layers.mean(x=mem3)
-        append_backward(loss=mem3_mean)
-        x_grad = exe.run(
-            feed={'x': tensor},
-            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
-        self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)


+class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
+    def test_no_lod(self):
+        cpu = core.CPUPlace()
+        x_tensor = core.LoDTensor()
+        tensor_np = np.random.random(size=(3, 100)).astype('float32')
+        x_tensor.set(tensor_np, cpu)
+
+        rank_table_tensor = core.LoDTensor()
+        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set(
+            np.random.random(size=(6, 1)).astype('float32'), cpu)
+
+        exe = Executor(cpu)
+        outs = exe.run(
+            feed={'x': x_tensor,
+                  'rank_table_tensor': rank_table_tensor},
+            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
+            return_numpy=False)
+        self.assertTrue(np.allclose(tensor_np[0:3], outs[0]))
+        self.assertTrue(np.allclose(tensor_np[0:2], outs[1]))
+        self.assertTrue(np.allclose(tensor_np[0:1], outs[2]))
+        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)
if __name__ == '__main__':
...
...
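The expected shapes in these tests follow from the rank table: LoD [[0, 1, 3, 6]] describes three sequences of lengths 1, 2, and 3, so at steps i = 0, 1, 2 the number of sequences still running is 3, 2, and 1, which is why mem1/mem2/mem3 keep 3, 2, and 1 rows in the no-LoD case. A short check of those counts:

lod = [0, 1, 3, 6]
lengths = [b - a for a, b in zip(lod, lod[1:])]  # [1, 2, 3]
print([sum(1 for l in lengths if l > i) for i in range(3)])  # [3, 2, 1]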
python/setup.py.in
View file @ 9eb3fb29
...
...
@@ -31,6 +31,7 @@ patch = '%(patch)d'
rc = '%(rc)d'
istaged = %(istaged)s
commit = '%(commit)s'
+with_mkl = '%(with_mkl)s'

def show():
    if istaged:
...
...
@@ -41,6 +42,9 @@ def show():
        print 'rc:', rc
    else:
        print 'commit:', commit

+def mkl():
+    return with_mkl
'''
    commit = git_commit()
    with open(filename, 'w') as f:
...
...
@@ -51,7 +55,8 @@ def show():
        'rc': RC,
        'version': '${PADDLE_VERSION}',
        'commit': commit,
-        'istaged': ISTAGED})
+        'istaged': ISTAGED,
+        'with_mkl': '@WITH_MKL@'})

write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
...
...
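Once the wheel is built, the flag is queryable at runtime; set_env_vars in python/paddle/v2/__init__.py uses exactly this check:

import paddle

# '@WITH_MKL@' is substituted at build time, so this returns 'ON' or 'OFF'.
if paddle.version.mkl() == 'ON':
    print('MKL build: OMP/MKL thread variables will be set')
else:
    print('OpenBLAS build: OPENBLAS_NUM_THREADS will be set')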