Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
330fa95c
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
330fa95c
编写于
4月 27, 2018
作者:
F
fengjiayi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Follow comments
上级
bfe08446
变更
11
显示空白变更内容
内联
并排
Showing
11 changed file
with
93 addition
and
76 deletion
+93
-76
paddle/fluid/framework/details/broadcast_op_handle_test.cc
paddle/fluid/framework/details/broadcast_op_handle_test.cc
+2
-2
paddle/fluid/framework/details/fetch_op_handle.cc
paddle/fluid/framework/details/fetch_op_handle.cc
+1
-2
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+2
-4
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+39
-10
paddle/fluid/framework/tensor_util.h
paddle/fluid/framework/tensor_util.h
+3
-2
paddle/fluid/operators/fetch_op.cc
paddle/fluid/operators/fetch_op.cc
+1
-3
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+7
-9
paddle/fluid/operators/math/math_function_test.cu
paddle/fluid/operators/math/math_function_test.cu
+31
-31
paddle/fluid/operators/math/vol2col_test.cc
paddle/fluid/operators/math/vol2col_test.cc
+4
-5
paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
.../fluid/operators/reader/create_double_buffer_reader_op.cc
+1
-2
paddle/fluid/operators/reshape_op.h
paddle/fluid/operators/reshape_op.h
+2
-6
未找到文件。
paddle/fluid/framework/details/broadcast_op_handle_test.cc
浏览文件 @
330fa95c
...
...
@@ -139,7 +139,7 @@ struct TestBroadcastOpHandle {
PADDLE_ENFORCE_EQ
(
out_tensor
.
lod
(),
lod
,
"lod is not equal."
);
f
::
Tensor
result_tensor
;
f
::
TensorCopy
(
out_tensor
,
cpu_place
,
*
(
ctxs_
[
j
]),
&
result_tensor
,
true
);
f
::
TensorCopy
Sync
(
out_tensor
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
mutable_data
<
float
>
(
cpu_place
);
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
...
...
@@ -185,7 +185,7 @@ struct TestBroadcastOpHandle {
}
f
::
Tensor
result_tensor
;
f
::
TensorCopy
(
rt
,
cpu_place
,
*
(
ctxs_
[
j
]),
&
result_tensor
,
true
);
f
::
TensorCopy
Sync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
...
...
paddle/fluid/framework/details/fetch_op_handle.cc
浏览文件 @
330fa95c
...
...
@@ -66,8 +66,7 @@ void FetchOpHandle::RunImpl() {
auto
&
t
=
var
->
Get
<
framework
::
LoDTensor
>
();
if
(
platform
::
is_gpu_place
(
t
.
place
()))
{
#ifdef PADDLE_WITH_CUDA
TensorCopy
(
t
,
cpu
,
*
dev_ctxes_
[
t
.
place
()],
&
tensors_
[
i
],
true
);
dev_ctxes_
.
at
(
t
.
place
())
->
Wait
();
TensorCopySync
(
t
,
cpu
,
&
tensors_
[
i
]);
#endif
}
else
{
tensors_
[
i
].
ShareDataWith
(
t
);
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
330fa95c
...
...
@@ -194,8 +194,7 @@ struct TestReduceOpHandle {
}
f
::
Tensor
result_tensor
;
f
::
TensorCopy
(
rt
,
cpu_place
,
*
(
ctxs_
[
output_scope_idx
]),
&
result_tensor
,
true
);
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
j
=
0
;
j
<
f
::
product
(
result_tensor
.
dims
());
++
j
)
{
...
...
@@ -240,8 +239,7 @@ struct TestReduceOpHandle {
auto
&
rt
=
out_var
->
Get
<
f
::
LoDTensor
>
();
f
::
Tensor
result_tensor
;
f
::
TensorCopy
(
rt
,
cpu_place
,
*
(
ctxs_
[
output_scope_idx
]),
&
result_tensor
,
true
);
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
j
=
0
;
j
<
f
::
product
(
result_tensor
.
dims
());
++
j
)
{
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
330fa95c
...
...
@@ -20,7 +20,7 @@ namespace paddle {
namespace
framework
{
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
,
bool
sync
)
{
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
...
...
@@ -48,9 +48,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
src_gpu_place
,
ctx_gpu_place
);
auto
stream
=
sync
?
nullptr
:
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
)
.
stream
();
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
...
...
@@ -61,9 +59,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
dst_gpu_place
,
ctx_gpu_place
);
auto
stream
=
sync
?
nullptr
:
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
)
.
stream
();
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
...
...
@@ -72,9 +68,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
));
auto
stream
=
sync
?
nullptr
:
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
)
.
stream
();
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
}
#endif
...
...
@@ -92,6 +86,41 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
TensorCopy
(
src
,
dst_place
,
*
dev_ctx
,
dst
);
}
// Copies the contents of `src` into `*dst`, placing the result on `dst_place`,
// without taking a DeviceContext.
//
// Before copying, `*dst` is resized to src's dims and given src's layout, and
// its buffer is obtained on `dst_place` with src's element type. The number of
// bytes copied is src.numel() * SizeOfType(src.type()).
//
// Place combinations handled:
//   - CPU -> CPU: always.
//   - GPU -> CPU, CPU -> GPU, GPU -> GPU: only when PADDLE_WITH_CUDA is
//     defined.
// For any other combination no bytes are copied (dst has still been resized
// and allocated by then).
//
// Every CUDA copy is issued with a null stream.
// NOTE(review): presumably the null stream makes memory::Copy block until the
// transfer is done, which is what "Sync" refers to -- confirm against
// memory::Copy.
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                    Tensor* dst) {
  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
          << " to " << dst_place;
  src.check_memory_size();

  // Make dst mirror src's shape and layout before its buffer is requested.
  dst->Resize(src.dims());
  dst->set_layout(src.layout());

  auto from_place = src.place();
  auto from_ptr = src.data<void>();
  auto to_ptr = dst->mutable_data(dst_place, src.type());
  auto num_bytes = src.numel() * SizeOfType(src.type());

  const bool from_cpu = platform::is_cpu_place(from_place);
  const bool to_cpu = platform::is_cpu_place(dst_place);

  // Host-to-host copy: this overload of memory::Copy takes no stream.
  if (from_cpu && to_cpu) {
    auto to_cpu_place = boost::get<platform::CPUPlace>(dst_place);
    auto from_cpu_place = boost::get<platform::CPUPlace>(from_place);
    memory::Copy(to_cpu_place, to_ptr, from_cpu_place, from_ptr, num_bytes);
    return;
  }

#ifdef PADDLE_WITH_CUDA
  const bool from_gpu = platform::is_gpu_place(from_place);
  const bool to_gpu = platform::is_gpu_place(dst_place);

  if (from_gpu && to_cpu) {
    // Device -> host.
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), to_ptr,
                 boost::get<platform::CUDAPlace>(from_place), from_ptr,
                 num_bytes, nullptr);
  } else if (from_cpu && to_gpu) {
    // Host -> device.
    memory::Copy(boost::get<platform::CUDAPlace>(dst_place), to_ptr,
                 boost::get<platform::CPUPlace>(from_place), from_ptr,
                 num_bytes, nullptr);
  } else if (from_gpu && to_gpu) {
    // Device -> device.
    memory::Copy(boost::get<platform::CUDAPlace>(dst_place), to_ptr,
                 boost::get<platform::CUDAPlace>(from_place), from_ptr,
                 num_bytes, nullptr);
  }
#endif
}
template
<
typename
Predicate
,
typename
DevCtx
>
struct
AnyDTypeVisitor
{
Predicate
predicate_
;
...
...
paddle/fluid/framework/tensor_util.h
浏览文件 @
330fa95c
...
...
@@ -24,10 +24,11 @@ namespace paddle {
namespace
framework
{
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
,
bool
sync
=
false
);
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
);
// Overload of TensorCopy that takes no DeviceContext argument: copies `src`
// into `*dst` on `dst_place`.
// NOTE(review): the implementation in tensor_util.cc appears to select a
// device context itself and forward to the overload that takes one -- confirm
// there which context is chosen.
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                Tensor* dst);
// Copies `src` into `*dst`, placing the result on `dst_place`. No
// DeviceContext is passed in.
// NOTE(review): presumably blocks until the copy has completed (hence
// "Sync") -- confirm against the definition in tensor_util.cc.
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                    Tensor* dst);
template
<
typename
T
>
void
TensorFromVector
(
const
std
::
vector
<
T
>&
src
,
...
...
paddle/fluid/operators/fetch_op.cc
浏览文件 @
330fa95c
...
...
@@ -57,9 +57,7 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
auto
&
dev_ctx
=
*
pool
.
Get
(
src_item
.
place
());
TensorCopy
(
src_item
,
platform
::
CPUPlace
(),
dev_ctx
,
&
dst_item
,
true
);
TensorCopySync
(
src_item
,
platform
::
CPUPlace
(),
&
dst_item
);
dst_item
.
set_lod
(
src_item
.
lod
());
VLOG
(
3
)
<<
"Fetch variable "
<<
fetch_var_name
<<
" to "
<<
out_name
;
...
...
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
330fa95c
...
...
@@ -62,7 +62,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
TensorCopy
(
input_tmp
,
*
place
,
*
context
,
&
input
,
true
);
TensorCopy
Sync
(
input_tmp
,
*
place
,
&
input
);
}
output_cfo
.
mutable_data
<
float
>
(
{
1
,
filter_size
,
filter_size
,
output_height
,
output_width
},
*
place
);
...
...
@@ -87,8 +87,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
out_cfo_ptr
=
output_cfo
.
data
<
float
>
();
}
else
{
TensorCopy
(
output_cfo
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
output_tmp
,
true
);
TensorCopySync
(
output_cfo
,
paddle
::
platform
::
CPUPlace
(),
&
output_tmp
);
out_cfo_ptr
=
output_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
...
...
@@ -99,8 +98,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
out_ocf_ptr
=
output_ocf
.
data
<
float
>
();
}
else
{
TensorCopy
(
output_ocf
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
output_tmp
,
true
);
TensorCopySync
(
output_ocf
,
paddle
::
platform
::
CPUPlace
(),
&
output_tmp
);
out_ocf_ptr
=
output_tmp
.
data
<
float
>
();
}
...
...
@@ -121,7 +119,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
TensorCopy
(
input_tmp
,
*
place
,
*
context
,
&
input
,
true
);
TensorCopy
Sync
(
input_tmp
,
*
place
,
&
input
);
}
col2im
(
*
context
,
output_cfo
,
dilation
,
stride
,
padding
,
&
input
);
...
...
@@ -130,7 +128,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
in_ptr
=
input
.
data
<
float
>
();
}
else
{
TensorCopy
(
input
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
input_tmp
,
true
);
TensorCopy
Sync
(
input
,
paddle
::
platform
::
CPUPlace
(),
&
input_tmp
);
in_ptr
=
input_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
...
...
@@ -142,7 +140,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
TensorCopy
(
input_tmp
,
*
place
,
*
context
,
&
input
,
true
);
TensorCopy
Sync
(
input_tmp
,
*
place
,
&
input
);
}
col2im_ocf
(
*
context
,
output_ocf
,
dilation
,
stride
,
padding
,
&
input
);
...
...
@@ -150,7 +148,7 @@ void testIm2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
in_ptr
=
input
.
data
<
float
>
();
}
else
{
TensorCopy
(
input
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
input_tmp
,
true
);
TensorCopy
Sync
(
input
,
paddle
::
platform
::
CPUPlace
(),
&
input_tmp
);
in_ptr
=
input_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
...
...
paddle/fluid/operators/math/math_function_test.cu
浏览文件 @
330fa95c
...
...
@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) {
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
memcpy
(
input1_ptr
,
arr
,
6
*
sizeof
(
float
));
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float
>
({
2
,
2
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
CUDADeviceContext
,
float
>
(
context
,
input1_gpu
,
false
,
input2_gpu
,
true
,
1
,
&
out_gpu
,
0
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
,
true
);
TensorCopy
Sync
(
out_gpu
,
cpu_place
,
&
out
);
float
*
out_ptr
=
out
.
data
<
float
>
();
context
.
Wait
();
...
...
@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) {
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float16
>
({
2
,
2
},
gpu_place
);
...
...
@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
context
,
input1_gpu
,
false
,
input2_gpu
,
true
,
float16
(
1
),
&
out_gpu
,
float16
(
0
));
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
,
true
);
TensorCopy
Sync
(
out_gpu
,
cpu_place
,
&
out
);
float16
*
out_ptr
=
out
.
data
<
float16
>
();
context
.
Wait
();
...
...
@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) {
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
memcpy
(
input1_ptr
,
arr
,
6
*
sizeof
(
float
));
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float
>
({
3
,
3
},
gpu_place
);
paddle
::
operators
::
math
::
matmul
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
(
context
,
input1_gpu
,
true
,
input2_gpu
,
false
,
1
,
&
out_gpu
,
0
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
,
true
);
TensorCopy
Sync
(
out_gpu
,
cpu_place
,
&
out
);
float
*
out_ptr
=
out
.
data
<
float
>
();
context
.
Wait
();
...
...
@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) {
float16
*
input1_ptr
=
input1
.
mutable_data
<
float16
>
({
2
,
3
},
cpu_place
);
fill_fp16_data
(
input1_ptr
,
input1
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
});
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float16
>
({
3
,
3
},
gpu_place
);
...
...
@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
context
,
input1_gpu
,
true
,
input2_gpu
,
false
,
float16
(
1
),
&
out_gpu
,
float16
(
0
));
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
,
true
);
TensorCopy
Sync
(
out_gpu
,
cpu_place
,
&
out
);
float16
*
out_ptr
=
out
.
data
<
float16
>
();
context
.
Wait
();
...
...
@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float
arr3
[
8
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
};
memcpy
(
input3_ptr
,
arr3
,
8
*
sizeof
(
float
));
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input2
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
(
input3
,
gpu_place
,
context
,
&
input3_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input2
,
gpu_place
,
&
input2_gpu
);
TensorCopy
Sync
(
input3
,
gpu_place
,
&
input3_gpu
);
float
*
a
=
input1_gpu
.
data
<
float
>
();
float
*
b
=
input2_gpu
.
data
<
float
>
();
float
*
c
=
input3_gpu
.
mutable_data
<
float
>
(
gpu_place
);
...
...
@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle
::
operators
::
math
::
gemm
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
(
context
,
false
,
false
,
m
,
n
,
k
,
1
,
a
,
3
,
b
+
1
,
4
,
1
,
c
+
1
,
4
);
TensorCopy
(
input3_gpu
,
cpu_place
,
context
,
&
input3
,
true
);
TensorCopy
Sync
(
input3_gpu
,
cpu_place
,
&
input3
);
// numpy code:
// a = np.arange(6).reshape(2, 3)
...
...
@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
float16
*
input3_ptr
=
input3
.
mutable_data
<
float16
>
({
2
,
4
},
cpu_place
);
fill_fp16_data
(
input3_ptr
,
input3
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
});
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input2
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
(
input3
,
gpu_place
,
context
,
&
input3_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input2
,
gpu_place
,
&
input2_gpu
);
TensorCopy
Sync
(
input3
,
gpu_place
,
&
input3_gpu
);
float16
*
a
=
input1_gpu
.
data
<
float16
>
();
float16
*
b
=
input2_gpu
.
data
<
float16
>
();
float16
*
c
=
input3_gpu
.
mutable_data
<
float16
>
(
gpu_place
);
...
...
@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
context
,
false
,
false
,
m
,
n
,
k
,
float16
(
1
),
a
,
3
,
b
+
1
,
4
,
float16
(
1
),
c
+
1
,
4
);
TensorCopy
(
input3_gpu
,
cpu_place
,
context
,
&
input3
,
true
);
TensorCopy
Sync
(
input3_gpu
,
cpu_place
,
&
input3
);
// numpy code:
// a = np.arange(6).reshape(2, 3)
...
...
@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float
arr3
[
8
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
};
memcpy
(
input3_ptr
,
arr3
,
8
*
sizeof
(
float
));
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input2
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
(
input3
,
gpu_place
,
context
,
&
input3_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input2
,
gpu_place
,
&
input2_gpu
);
TensorCopy
Sync
(
input3
,
gpu_place
,
&
input3_gpu
);
float
*
a
=
input1_gpu
.
data
<
float
>
();
float
*
b
=
input2_gpu
.
data
<
float
>
();
float
*
c
=
input3_gpu
.
mutable_data
<
float
>
(
gpu_place
);
...
...
@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle
::
operators
::
math
::
gemm
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
(
context
,
false
,
true
,
m
,
n
,
k
,
1
,
a
,
3
,
b
+
3
,
3
,
1
,
c
+
1
,
4
);
TensorCopy
(
input3_gpu
,
cpu_place
,
context
,
&
input3
,
true
);
TensorCopy
Sync
(
input3_gpu
,
cpu_place
,
&
input3
);
context
.
Wait
();
EXPECT_EQ
(
input3_ptr
[
0
],
0
);
...
...
@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
float16
*
input3_ptr
=
input3
.
mutable_data
<
float16
>
({
2
,
4
},
cpu_place
);
fill_fp16_data
(
input3_ptr
,
input3
.
numel
(),
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
});
TensorCopy
(
input1
,
gpu_place
,
context
,
&
input1_gpu
,
true
);
TensorCopy
(
input2
,
gpu_place
,
context
,
&
input2_gpu
,
true
);
TensorCopy
(
input3
,
gpu_place
,
context
,
&
input3_gpu
,
true
);
TensorCopy
Sync
(
input1
,
gpu_place
,
&
input1_gpu
);
TensorCopy
Sync
(
input2
,
gpu_place
,
&
input2_gpu
);
TensorCopy
Sync
(
input3
,
gpu_place
,
&
input3_gpu
);
float16
*
a
=
input1_gpu
.
data
<
float16
>
();
float16
*
b
=
input2_gpu
.
data
<
float16
>
();
float16
*
c
=
input3_gpu
.
mutable_data
<
float16
>
(
gpu_place
);
...
...
@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
context
,
false
,
true
,
m
,
n
,
k
,
float16
(
1
),
a
,
3
,
b
+
3
,
3
,
float16
(
1
),
c
+
1
,
4
);
TensorCopy
(
input3_gpu
,
cpu_place
,
context
,
&
input3
,
true
);
TensorCopy
Sync
(
input3_gpu
,
cpu_place
,
&
input3
);
context
.
Wait
();
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
0
]),
0
);
...
...
@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) {
data_b
[
i
]
=
static_cast
<
T
>
(
i
);
}
TensorCopy
(
mat_a
,
gpu_place
,
context
,
&
g_mat_a
,
true
);
TensorCopy
(
vec_b
,
gpu_place
,
context
,
&
g_vec_b
,
true
);
TensorCopy
Sync
(
mat_a
,
gpu_place
,
&
g_mat_a
);
TensorCopy
Sync
(
vec_b
,
gpu_place
,
&
g_vec_b
);
paddle
::
operators
::
math
::
gemv
<
CUDADeviceContext
,
T
>
(
context
,
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
TensorCopy
(
g_vec_c
,
cpu_place
,
context
,
&
vec_c
,
true
);
TensorCopy
Sync
(
g_vec_c
,
cpu_place
,
&
vec_c
);
if
(
!
trans
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
...
...
paddle/fluid/operators/math/vol2col_test.cc
浏览文件 @
330fa95c
...
...
@@ -71,7 +71,7 @@ void testVol2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
paddle
::
framework
::
TensorCopy
(
input_tmp
,
*
place
,
*
context
,
&
input
,
true
);
paddle
::
framework
::
TensorCopy
Sync
(
input_tmp
,
*
place
,
&
input
);
}
output
.
mutable_data
<
float
>
({
1
,
filter_size
,
filter_size
,
filter_size
,
output_depth
,
output_height
,
output_width
},
...
...
@@ -85,8 +85,7 @@ void testVol2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
out_cfo_ptr
=
output
.
data
<
float
>
();
}
else
{
TensorCopy
(
output
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
output_tmp
,
true
);
TensorCopySync
(
output
,
paddle
::
platform
::
CPUPlace
(),
&
output_tmp
);
out_cfo_ptr
=
output_tmp
.
data
<
float
>
();
}
...
...
@@ -100,7 +99,7 @@ void testVol2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
TensorCopy
(
input_tmp
,
*
place
,
*
context
,
&
input
,
true
);
TensorCopy
Sync
(
input_tmp
,
*
place
,
&
input
);
}
paddle
::
operators
::
math
::
Col2VolFunctor
<
DeviceContext
,
float
>
col2vol
;
...
...
@@ -110,7 +109,7 @@ void testVol2col() {
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
in_ptr
=
input
.
data
<
float
>
();
}
else
{
TensorCopy
(
input
,
paddle
::
platform
::
CPUPlace
(),
*
context
,
&
input_tmp
,
true
);
TensorCopy
Sync
(
input
,
paddle
::
platform
::
CPUPlace
(),
&
input_tmp
);
in_ptr
=
input_tmp
.
data
<
float
>
();
}
...
...
paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
浏览文件 @
330fa95c
...
...
@@ -180,8 +180,7 @@ void DoubleBufferReader::PrefetchThreadFunc() {
auto
*
gpu_ctx
=
ctxs_
[
cached_tensor_id
].
get
();
gpu_batch
.
resize
(
cpu_batch
.
size
());
for
(
size_t
i
=
0
;
i
<
cpu_batch
.
size
();
++
i
)
{
framework
::
TensorCopy
(
cpu_batch
[
i
],
place_
,
*
gpu_ctx
,
&
gpu_batch
[
i
],
true
);
framework
::
TensorCopySync
(
cpu_batch
[
i
],
place_
,
&
gpu_batch
[
i
]);
gpu_batch
[
i
].
set_lod
(
cpu_batch
[
i
].
lod
());
}
}
...
...
paddle/fluid/operators/reshape_op.h
浏览文件 @
330fa95c
...
...
@@ -124,10 +124,8 @@ class ReshapeKernel : public framework::OpKernel<T> {
auto
*
shape_data
=
shape_tensor
->
data
<
int
>
();
framework
::
Tensor
cpu_shape_tensor
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
TensorCopy
(
*
shape_tensor
,
platform
::
CPUPlace
(),
ctx
.
device_context
(),
&
cpu_shape_tensor
);
TensorCopySync
(
*
shape_tensor
,
platform
::
CPUPlace
(),
&
cpu_shape_tensor
);
shape_data
=
cpu_shape_tensor
.
data
<
int
>
();
ctx
.
device_context
().
Wait
();
}
auto
shape
=
std
::
vector
<
int
>
(
shape_data
,
shape_data
+
shape_tensor
->
numel
());
...
...
@@ -146,9 +144,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
framework
::
TensorCopy
(
*
in
,
ctx
.
GetPlace
(),
ctx
.
device_context
(),
out
);
ctx
.
device_context
().
Wait
();
// TensorCopy will resize to in_dims.
framework
::
TensorCopySync
(
*
in
,
ctx
.
GetPlace
(),
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
in
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录