Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
e41a3fcd
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e41a3fcd
编写于
10月 16, 2018
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix update to develop hang problem.
上级
804dd7da
变更
11
显示空白变更内容
内联
并排
Showing
11 changed file
with
675 addition
and
403 deletion
+675
-403
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+14
-14
paddle/fluid/framework/data_type_transform.cu
paddle/fluid/framework/data_type_transform.cu
+106
-15
paddle/fluid/framework/ir/node.cc
paddle/fluid/framework/ir/node.cc
+5
-2
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+4
-0
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+1
-1
paddle/fluid/framework/tensor_util.cu
paddle/fluid/framework/tensor_util.cu
+362
-15
paddle/fluid/inference/api/demo_ci/inference_icnet.cc
paddle/fluid/inference/api/demo_ci/inference_icnet.cc
+157
-246
paddle/fluid/inference/api/demo_ci/naive_model_test.cc
paddle/fluid/inference/api/demo_ci/naive_model_test.cc
+0
-97
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+13
-6
paddle/fluid/operators/load_combine_op.cc
paddle/fluid/operators/load_combine_op.cc
+9
-7
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+4
-0
未找到文件。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
e41a3fcd
...
...
@@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
if
(
WITH_GPU
)
if
(
WIN32
)
windows_symbolic
(
tensor_util SRCS tensor_util.cu
)
nv_library
(
tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context
)
add_dependencies
(
tensor tensor_util
)
else
()
# //
if (WIN32)
# //
windows_symbolic(tensor_util SRCS tensor_util.cu)
# //
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
# //
add_dependencies(tensor tensor_util)
# //
else()
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
endif
(
WIN32
)
#
endif(WIN32)
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
)
endif
()
...
...
@@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function
)
if
(
WITH_GPU
)
if
(
WIN32
)
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
windows_symbolic
(
hidden_file SRCS data_type_transform.cu
)
nv_library
(
data_type_transform SRCS .data_type_transform.cu DEPS tensor
)
add_dependencies
(
data_type_transform hidden_file
)
else
()
#
if (WIN32)
#
# windows treat symbolic file as a real file, which is different with unix
#
# We create a hidden file and compile it instead of origin source file.
#
windows_symbolic(hidden_file SRCS data_type_transform.cu)
#
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
#
add_dependencies(data_type_transform hidden_file)
#
else()
nv_library
(
data_type_transform SRCS data_type_transform.cu DEPS tensor
)
endif
(
WIN32
)
#
endif(WIN32)
nv_test
(
data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform
)
else
()
cc_library
(
data_type_transform SRCS data_type_transform.cc DEPS tensor
)
...
...
paddle/fluid/framework/data_type_transform.cu
浏览文件 @
e41a3fcd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
data_type_transform
.
cc
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
framework
{
template
<
typename
InType
,
typename
OutType
>
struct
CastDataTypeFunctor
{
HOSTDEVICE
inline
OutType
operator
()(
InType
in
)
const
{
return
static_cast
<
OutType
>
(
in
);
}
};
template
<
typename
InType
>
struct
CastDataType
{
CastDataType
(
const
framework
::
Tensor
&
in
,
framework
::
Tensor
*
out
,
const
platform
::
DeviceContext
*
ctx
)
:
in_
(
in
),
out_
(
out
),
ctx_
(
ctx
)
{}
const
framework
::
Tensor
in_
;
framework
::
Tensor
*
out_
;
const
platform
::
DeviceContext
*
ctx_
;
template
<
typename
OutType
>
void
apply
()
{
auto
*
in_begin
=
in_
.
data
<
InType
>
();
auto
*
in_end
=
in_begin
+
in_
.
numel
();
auto
*
out_begin
=
out_
->
mutable_data
<
OutType
>
(
in_
.
place
());
if
(
platform
::
is_cpu_place
(
in_
.
place
()))
{
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
auto
*
context
=
static_cast
<
const
platform
::
CPUDeviceContext
*>
(
ctx_
);
trans
(
*
context
,
in_begin
,
in_end
,
out_begin
,
CastDataTypeFunctor
<
InType
,
OutType
>
());
#ifdef __NVCC__
}
else
if
(
platform
::
is_gpu_place
(
in_
.
place
()))
{
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
context
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
ctx_
);
trans
(
*
context
,
in_begin
,
in_end
,
out_begin
,
CastDataTypeFunctor
<
InType
,
OutType
>
());
context
->
Wait
();
#endif
}
else
{
PADDLE_THROW
(
"Unsupported place!"
);
}
}
};
void
TransDataType
(
const
OpKernelType
&
kernel_type_for_var
,
const
OpKernelType
&
expected_kernel_type
,
const
Tensor
&
in
,
Tensor
*
out
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
out
->
Resize
(
in
.
dims
());
auto
src_type
=
kernel_type_for_var
.
data_type_
;
auto
dst_type
=
expected_kernel_type
.
data_type_
;
auto
ctx
=
pool
.
Get
(
in
.
place
());
switch
(
src_type
)
{
case
proto
::
VarType
::
FP16
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
platform
::
float16
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
FP32
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
float
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
FP64
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
double
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
INT32
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
int
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
INT64
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
int64_t
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
BOOL
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
bool
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
INT16
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
bool
>
(
in
,
out
,
ctx
));
break
;
case
proto
::
VarType
::
UINT8
:
framework
::
VisitDataType
(
dst_type
,
CastDataType
<
bool
>
(
in
,
out
,
ctx
));
break
;
default:
PADDLE_THROW
(
"Not support type %d"
,
src_type
);
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/node.cc
浏览文件 @
e41a3fcd
...
...
@@ -17,8 +17,11 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
namespace
ir
{
constexpr
char
Node
::
kControlDepVarName
[];
#if !defined(_WIN32)
constexpr
char
Node
::
kControlDepVarName
[]
=
"__control_var"
;
#else
const
char
Node
::
kControlDepVarName
[]
=
"__control_var"
;
#endif
int
Node
::
count_
=
0
;
}
// namespace ir
}
// namespace framework
...
...
paddle/fluid/framework/ir/node.h
浏览文件 @
e41a3fcd
...
...
@@ -27,7 +27,11 @@ namespace ir {
class
Node
{
public:
enum
class
Type
{
kOperation
,
kVariable
};
#if !defined(_WIN32) // msvc not support constexpr correctly.
static
constexpr
char
kControlDepVarName
[]
=
"__control_var"
;
#else
static
const
char
kControlDepVarName
[];
#endif
explicit
Node
(
const
std
::
string
&
name
,
Type
type
)
:
name_
(
name
),
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
e41a3fcd
...
...
@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
VLOG
(
3
)
<<
"expected_kernel_key:
"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
#ifdef PADDLE_WITH_MKLDNN
...
...
paddle/fluid/framework/tensor_util.cu
浏览文件 @
e41a3fcd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
tensor_util
.
cc
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace
paddle
{
namespace
framework
{
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
dst
->
set_layout
(
src
.
layout
());
auto
src_place
=
src
.
place
();
auto
src_ptr
=
src
.
data
<
void
>
();
auto
dst_ptr
=
dst
->
mutable_data
(
dst_place
,
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
#ifdef PADDLE_WITH_CUDA
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
));
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
src_gpu_place
,
ctx_gpu_place
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
));
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
dst_gpu_place
,
ctx_gpu_place
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
));
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
}
else
{
if
(
platform
::
is_same_place
(
ctx_place
,
src_place
))
{
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
platform
::
DeviceContextPool
::
Instance
().
Get
(
src
.
place
())
->
Wait
();
}
else
if
(
platform
::
is_same_place
(
ctx_place
,
dst_place
))
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
src
.
place
())
->
Wait
();
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
}
else
{
PADDLE_THROW
(
"ctx is not belong to dst_gpu_place or src_gpu_place."
);
}
}
}
#endif
}
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
Tensor
*
dst
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
const
platform
::
DeviceContext
*
dev_ctx
;
if
(
platform
::
is_gpu_place
(
dst_place
))
{
dev_ctx
=
pool
.
Get
(
dst_place
);
}
else
{
dev_ctx
=
pool
.
Get
(
src
.
place
());
}
TensorCopy
(
src
,
dst_place
,
*
dev_ctx
,
dst
);
}
void
TensorCopySync
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"TensorCopySync "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
dst
->
set_layout
(
src
.
layout
());
auto
src_place
=
src
.
place
();
auto
src_ptr
=
src
.
data
<
void
>
();
auto
dst_ptr
=
dst
->
mutable_data
(
dst_place
,
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
#ifdef PADDLE_WITH_CUDA
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
#endif
}
template
<
typename
Predicate
,
typename
DevCtx
>
struct
AnyDTypeVisitor
{
Predicate
predicate_
;
const
Tensor
&
tensor_
;
const
DevCtx
&
ctx_
;
Tensor
*
out_
;
AnyDTypeVisitor
(
Predicate
predicate
,
const
Tensor
&
tensor
,
const
DevCtx
&
ctx
,
Tensor
*
out
)
:
predicate_
(
predicate
),
tensor_
(
tensor
),
ctx_
(
ctx
),
out_
(
out
)
{}
template
<
typename
T
>
void
apply
()
const
{
auto
t
=
EigenVector
<
T
>::
Flatten
(
tensor_
);
auto
o
=
EigenScalar
<
bool
>::
From
(
*
out_
);
// return any of predicate_(t) is true.
o
.
device
(
*
ctx_
.
eigen_device
())
=
predicate_
(
t
).
any
();
}
};
template
<
typename
Predicate
,
typename
DevCtx
>
inline
void
AnyImpl
(
Predicate
predicate
,
const
framework
::
Tensor
&
tensor
,
const
DevCtx
&
ctx
,
framework
::
Tensor
*
out
)
{
VisitDataType
(
ToDataType
(
tensor
.
type
()),
AnyDTypeVisitor
<
Predicate
,
DevCtx
>
(
predicate
,
tensor
,
ctx
,
out
));
}
template
<
typename
Predicate
>
struct
AnyVisitor
:
public
boost
::
static_visitor
<
bool
>
{
const
framework
::
Tensor
&
tensor_
;
Predicate
predicate_
;
AnyVisitor
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
:
tensor_
(
tensor
),
predicate_
(
std
::
move
(
predicate
))
{}
template
<
typename
Place
>
bool
operator
()(
const
Place
&
place
)
const
{
framework
::
Tensor
out
;
out
.
Resize
({
1
});
out
.
mutable_data
<
bool
>
(
place
);
auto
*
ctx
=
platform
::
DeviceContextPool
::
Instance
().
GetByPlace
(
place
);
AnyImpl
(
predicate_
,
tensor_
,
*
ctx
,
&
out
);
return
this
->
GetResult
(
out
,
place
);
}
bool
GetResult
(
const
framework
::
Tensor
&
out
,
const
platform
::
CUDAPlace
&
gpu
)
const
{
platform
::
CPUPlace
cpu
;
framework
::
Tensor
tmp
;
tmp
.
Resize
({
1
});
tmp
.
mutable_data
<
bool
>
(
cpu
);
auto
gpuctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
gpu
);
gpuctx
->
Wait
();
TensorCopy
(
out
,
cpu
,
*
gpuctx
,
&
tmp
);
gpuctx
->
Wait
();
return
GetResult
(
tmp
,
cpu
);
}
bool
GetResult
(
const
framework
::
Tensor
&
out
,
const
platform
::
CPUPlace
&
cpu
)
const
{
return
*
out
.
data
<
bool
>
();
}
bool
GetResult
(
const
framework
::
Tensor
&
out
,
const
platform
::
CUDAPinnedPlace
&
cpu
)
const
{
return
*
out
.
data
<
bool
>
();
}
};
template
<
typename
Predicate
>
inline
bool
Any
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
{
AnyVisitor
<
Predicate
>
visitor
(
tensor
,
predicate
);
auto
place
=
tensor
.
place
();
return
platform
::
VisitPlace
(
place
,
visitor
);
}
struct
ContainsNANPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
->
decltype
(
std
::
declval
<
T
>
().
isnan
())
{
// Cast eigen_vector to vector of bool. true if is inf.
return
eigen_vec
.
isnan
();
}
};
bool
TensorContainsNAN
(
const
framework
::
Tensor
&
tensor
)
{
ContainsNANPredicate
predicate
;
return
Any
(
tensor
,
predicate
);
}
struct
ContainsInfPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
->
decltype
(
std
::
declval
<
T
>
().
isinf
())
{
// Cast eigen_vector to vector of bool. true if is inf.
return
eigen_vec
.
isinf
();
}
};
bool
TensorContainsInf
(
const
framework
::
Tensor
&
tensor
)
{
ContainsInfPredicate
predicate
;
return
Any
(
tensor
,
predicate
);
}
void
TensorToStream
(
std
::
ostream
&
os
,
const
Tensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
{
// the 1st field, uint32_t version
constexpr
uint32_t
version
=
0
;
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
version
),
sizeof
(
version
));
}
{
// the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto
::
VarType
::
TensorDesc
desc
;
desc
.
set_data_type
(
framework
::
ToDataType
(
tensor
.
type
()));
auto
dims
=
framework
::
vectorize
(
tensor
.
dims
());
auto
*
pb_dims
=
desc
.
mutable_dims
();
pb_dims
->
Resize
(
static_cast
<
int
>
(
dims
.
size
()),
0
);
std
::
copy
(
dims
.
begin
(),
dims
.
end
(),
pb_dims
->
begin
());
int32_t
size
=
desc
.
ByteSize
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
auto
out
=
desc
.
SerializeAsString
();
os
.
write
(
out
.
data
(),
size
);
}
{
// the 3rd field, tensor data
uint64_t
size
=
tensor
.
numel
()
*
framework
::
SizeOfType
(
tensor
.
type
());
auto
*
data_ptr
=
tensor
.
data
<
void
>
();
PADDLE_ENFORCE
(
size
<
std
::
numeric_limits
<
std
::
streamsize
>::
max
(),
"Index overflow when writing tensor"
);
if
(
platform
::
is_gpu_place
(
tensor
.
place
()))
{
#ifdef PADDLE_WITH_CUDA
constexpr
size_t
kBufSize
=
1024
*
1024
*
64
;
// 64MB
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
kBufSize
]);
auto
&
gpu_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
);
platform
::
CPUPlace
cpu
;
uintptr_t
data
=
reinterpret_cast
<
uintptr_t
>
(
data_ptr
);
while
(
size
!=
0
)
{
size_t
size_to_write
=
std
::
min
(
kBufSize
,
static_cast
<
size_t
>
(
size
));
memory
::
Copy
(
cpu
,
buf
.
get
(),
boost
::
get
<
platform
::
CUDAPlace
>
(
tensor
.
place
()),
reinterpret_cast
<
const
void
*>
(
data
),
size_to_write
,
gpu_dev_ctx
.
stream
());
gpu_dev_ctx
.
Wait
();
os
.
write
(
buf
.
get
(),
size_to_write
);
data
+=
size_to_write
;
size
-=
size_to_write
;
}
#else
PADDLE_THROW
(
"Unexpected branch"
);
#endif
}
else
{
os
.
write
(
static_cast
<
const
char
*>
(
data_ptr
),
static_cast
<
std
::
streamsize
>
(
size
));
}
}
}
struct
DeserializedDataFunctor
{
DeserializedDataFunctor
(
void
**
buf
,
Tensor
*
tensor
,
const
platform
::
Place
&
place
)
:
buf_
(
buf
),
tensor_
(
tensor
),
place_
(
place
)
{}
template
<
typename
T
>
void
apply
()
{
*
buf_
=
tensor_
->
mutable_data
<
T
>
(
place_
);
}
void
**
buf_
;
Tensor
*
tensor_
;
platform
::
Place
place_
;
};
void
TensorFromStream
(
std
::
istream
&
is
,
Tensor
*
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
uint32_t
version
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
version
),
sizeof
(
version
));
PADDLE_ENFORCE_EQ
(
version
,
0U
,
"Only version 0 is supported"
);
proto
::
VarType
::
TensorDesc
desc
;
{
// int32_t size
// proto buffer
int32_t
size
;
is
.
read
(
reinterpret_cast
<
char
*>
(
&
size
),
sizeof
(
size
));
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
size
]);
is
.
read
(
reinterpret_cast
<
char
*>
(
buf
.
get
()),
size
);
PADDLE_ENFORCE
(
desc
.
ParseFromArray
(
buf
.
get
(),
size
),
"Cannot parse tensor desc"
);
}
{
// read tensor
std
::
vector
<
int64_t
>
dims
;
dims
.
reserve
(
static_cast
<
size_t
>
(
desc
.
dims
().
size
()));
std
::
copy
(
desc
.
dims
().
begin
(),
desc
.
dims
().
end
(),
std
::
back_inserter
(
dims
));
tensor
->
Resize
(
framework
::
make_ddim
(
dims
));
void
*
buf
;
auto
ctx
=
platform
::
CPUDeviceContext
();
size_t
size
=
tensor
->
numel
()
*
framework
::
SizeOfType
(
framework
::
ToTypeIndex
(
desc
.
data_type
()));
if
(
platform
::
is_gpu_place
(
dev_ctx
.
GetPlace
()))
{
#ifdef PADDLE_WITH_CUDA
Tensor
cpu_tensor
;
cpu_tensor
.
Resize
(
framework
::
make_ddim
(
dims
));
framework
::
VisitDataType
(
desc
.
data_type
(),
DeserializedDataFunctor
(
&
buf
,
&
cpu_tensor
,
ctx
.
GetPlace
()));
is
.
read
(
static_cast
<
char
*>
(
buf
),
size
);
auto
dst_place
=
dev_ctx
.
GetPlace
();
framework
::
TensorCopy
(
cpu_tensor
,
dst_place
,
dev_ctx
,
tensor
);
#else
PADDLE_THROW
(
"Unexpected branch"
);
#endif
}
else
{
framework
::
VisitDataType
(
desc
.
data_type
(),
DeserializedDataFunctor
(
&
buf
,
tensor
,
ctx
.
GetPlace
()));
is
.
read
(
static_cast
<
char
*>
(
buf
),
size
);
}
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/api/demo_ci/inference_icnet.cc
浏览文件 @
e41a3fcd
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains a simple demo for how to take a model for inference.
*/
#include <cassert>
#include <cctype>
#include <algorithm>
#include <fstream>
#include <chrono>
#include <iostream>
#include <
iterator>
#include <
memory>
#include <
sstream>
#include <
fstream>
#include <
algorithm>
#include <
vector>
#include <string>
#include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h"
std
::
string
MODELDIR
=
""
;
/* "Directory of the inference model." */
// NOLINT
std
::
string
REFER
=
""
;
/*"path to reference result for comparison."*/
//NOTLINT
/*path of data; each line is a record, format:
<space splitted floats as data>\t<space splitted ints as shape>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace
paddle
{
Please check the demo data of data.txt for details.
*/
std
::
string
DATA
=
""
;
bool
USE_GPU
=
true
;
/*"Whether use gpu."*/
std
::
string
DIRNAME
=
"./Release/infer_model"
;
std
::
string
DATA
=
"./test-image.txt"
;
const
int
C
=
3
;
// image channel
const
int
H
=
449
;
// image height
const
int
W
=
581
;
// image width
// 数据格式
// "<space splitted floats as data>\t<space splitted ints as shape"
// 1. 存储为float32格式。
// 2. 必须减去均值。 CHW三个通道为 mean = 112.15, 109.41, 185.42
auto
message_err
=
[]()
struct
Record
{
std
::
cout
<<
"Copyright (c) 2018 PaddlePaddle Authors."
<<
std
::
endl
;
std
::
cout
<<
"Demo Case for windows inference. "
<<
"
\n
"
<<
"Usage: Input your model path and use_gpu as the guide requires,"
<<
"then run the demo inference, and will get a result."
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
;
std
::
vector
<
float
>
data
;
std
::
vector
<
int32_t
>
shape
;
};
namespace
paddle
{
namespace
demo
{
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>*
pieces
)
{
NativeConfig
GetConfig
()
{
NativeConfig
config
;
config
.
prog_file
=
DIRNAME
+
"/__model__"
;
config
.
param_file
=
DIRNAME
+
"/__params__"
;
config
.
fraction_of_gpu_memory
=
0.0
;
config
.
use_gpu
=
true
;
config
.
device
=
0
;
return
config
;
}
using
Time
=
decltype
(
std
::
chrono
::
high_resolution_clock
::
now
());
Time
time
()
{
return
std
::
chrono
::
high_resolution_clock
::
now
();
};
double
time_diff
(
Time
t1
,
Time
t2
)
{
typedef
std
::
chrono
::
microseconds
ms
;
auto
diff
=
t2
-
t1
;
ms
counter
=
std
::
chrono
::
duration_cast
<
ms
>
(
diff
);
return
counter
.
count
()
/
1000.0
;
}
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>*
pieces
)
{
pieces
->
clear
();
if
(
str
.
empty
())
{
if
(
str
.
empty
())
{
return
;
}
size_t
pos
=
0
;
size_t
next
=
str
.
find
(
sep
,
pos
);
while
(
next
!=
std
::
string
::
npos
)
{
while
(
next
!=
std
::
string
::
npos
)
{
pieces
->
push_back
(
str
.
substr
(
pos
,
next
-
pos
));
pos
=
next
+
1
;
next
=
str
.
find
(
sep
,
pos
);
}
if
(
!
str
.
substr
(
pos
).
empty
())
{
if
(
!
str
.
substr
(
pos
).
empty
())
{
pieces
->
push_back
(
str
.
substr
(
pos
));
}
}
/*
* Get a summary of a PaddleTensor content.
*/
std
::
string
SummaryTensor
(
const
PaddleTensor
&
tensor
)
{
std
::
stringstream
ss
;
int
num_elems
=
tensor
.
data
.
length
()
/
PaddleDtypeSize
(
tensor
.
dtype
);
ss
<<
"data[:10]
\t
"
;
switch
(
tensor
.
dtype
)
{
case
PaddleDType
::
INT64
:
for
(
int
i
=
0
;
i
<
std
::
min
(
num_elems
,
10
);
i
++
)
{
ss
<<
static_cast
<
int64_t
*>
(
tensor
.
data
.
data
())[
i
]
<<
" "
;
}
break
;
case
PaddleDType
::
FLOAT32
:
for
(
int
i
=
0
;
i
<
std
::
min
(
num_elems
,
10
);
i
++
)
{
ss
<<
static_cast
<
float
*>
(
tensor
.
data
.
data
())[
i
]
<<
" "
;
}
break
;
}
return
ss
.
str
();
}
std
::
string
ToString
(
const
NativeConfig
&
config
)
{
std
::
stringstream
ss
;
ss
<<
"Use GPU : "
<<
(
config
.
use_gpu
?
"True"
:
"False"
)
<<
"
\n
"
<<
"Device : "
<<
config
.
device
<<
"
\n
"
<<
"fraction_of_gpu_memory : "
<<
config
.
fraction_of_gpu_memory
<<
"
\n
"
<<
"specify_input_name : "
<<
(
config
.
specify_input_name
?
"True"
:
"False"
)
<<
"
\n
"
<<
"Program File : "
<<
config
.
prog_file
<<
"
\n
"
<<
"Param File : "
<<
config
.
param_file
;
return
ss
.
str
();
}
struct
Record
{
std
::
vector
<
float
>
data
;
std
::
vector
<
int32_t
>
shape
;
};
}
Record
ProcessALine
(
const
std
::
string
&
line
)
{
std
::
cout
<<
"process a line"
<<
std
::
endl
;
Record
ProcessALine
(
const
std
::
string
&
line
)
{
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
assert
(
columns
.
size
()
==
2UL
,
"data format error, should be <data>
\t
<shape>"
);
Record
record
;
std
::
vector
<
std
::
string
>
data_strs
;
split
(
columns
[
0
],
' '
,
&
data_strs
);
//将数据字符串转换为整型数据并放到record.data中
for
(
auto
&
d
:
data_strs
)
{
for
(
auto
&
d
:
data_strs
)
{
record
.
data
.
push_back
(
std
::
stof
(
d
));
}
std
::
vector
<
std
::
string
>
shape_strs
;
split
(
columns
[
1
],
' '
,
&
shape_strs
);
for
(
auto
&
s
:
shape_strs
)
{
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
std
::
cout
<<
"data size "
<<
record
.
data
.
size
()
<<
std
::
endl
;
std
::
cout
<<
"data shape size "
<<
record
.
shape
.
size
()
<<
std
::
endl
;
return
record
;
}
}
void
CheckOutput
(
const
std
::
string
&
referfile
,
const
PaddleTensor
&
output
)
{
std
::
string
line
;
std
::
ifstream
file
(
referfile
);
std
::
getline
(
file
,
line
);
auto
refer
=
ProcessALine
(
line
);
file
.
close
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
std
::
cout
<<
"predictor output numel "
<<
numel
<<
std
::
endl
;
std
::
cout
<<
"reference output numel "
<<
refer
.
data
.
size
()
<<
std
::
endl
;
assert
(
numel
==
refer
.
data
.
size
());
switch
(
output
.
dtype
)
{
case
PaddleDType
::
INT64
:
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
assert
(
static_cast
<
int64_t
*>
(
output
.
data
.
data
())[
i
]
==
refer
.
data
[
i
]);
}
break
;
case
PaddleDType
::
FLOAT32
:
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
assert
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
])
<=
1e-5
);
}
break
;
}
}
void
test_naive
(
int
batch_size
){
NativeConfig
config
=
GetConfig
();
auto
predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
int
height
=
H
;
int
width
=
W
;
int
channel
=
C
;
int
num_sum
=
height
*
width
*
channel
*
batch_size
;
/*
* Use the native fluid engine to inference the demo.
*/
void
Main
(
bool
use_gpu
)
{
NativeConfig
config
;
config
.
model_dir
=
MODELDIR
;
//config.param_file = MODELDIR + "/__params__";
//config.prog_file = MODELDIR + "/__model__";
config
.
use_gpu
=
USE_GPU
;
config
.
device
=
0
;
if
(
USE_GPU
)
{
config
.
fraction_of_gpu_memory
=
0.1
f
;
// set by yourself
}
std
::
cout
<<
ToString
(
config
)
<<
std
::
endl
;
std
::
cout
<<
"init predictor"
<<
std
::
endl
;
auto
predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
std
::
cout
<<
"begin to process data"
<<
std
::
endl
;
// Just a single batch of data.
std
::
string
line
;
std
::
cout
<<
"data : "
<<
std
::
endl
;
std
::
ifstream
file
(
DATA
);
if
(
!
file
.
is_open
())
{
std
::
cout
<<
"failed open data"
<<
DATA
<<
std
::
endl
;
exit
(
0
);
}
std
::
getline
(
file
,
line
);
auto
record
=
ProcessALine
(
line
);
file
.
close
();
// Inference.
PaddleTensor
input
;
input
.
shape
=
record
.
shape
;
input
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
cout
<<
"run executor"
<<
std
::
endl
;
std
::
vector
<
PaddleTensor
>
output
;
predictor
->
Run
({
input
},
&
output
);
std
::
cout
<<
"output.size "
<<
output
.
size
()
<<
std
::
endl
;
auto
&
tensor
=
output
.
front
();
std
::
cout
<<
"output: "
<<
SummaryTensor
(
tensor
)
<<
std
::
endl
;
// compare with reference result
std
::
cout
<<
"refer result : "
<<
REFER
<<
std
::
endl
;
CheckOutput
(
REFER
,
tensor
);
// 1. use fake data
std
::
vector
<
float
>
data
;
for
(
int
i
=
0
;
i
<
num_sum
;
i
++
)
{
data
.
push_back
(
0.0
);
}
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
batch_size
,
channel
,
height
,
width
});
tensor
.
data
.
Resize
(
sizeof
(
float
)
*
batch_size
*
channel
*
height
*
width
);
std
::
copy
(
data
.
begin
(),
data
.
end
(),
static_cast
<
float
*>
(
tensor
.
data
.
data
()));
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
// 2. read data from file
// std::string line;
// std::ifstream file(DATA);
// std::getline(file, line);
// auto record = ProcessALine(line);
// file.close();
// PaddleTensor tensor;
// tensor.shape = record.shape;
// tensor.data =
// PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
,
tensor
);
PaddleTensor
tensor_out
;
std
::
vector
<
PaddleTensor
>
outputs
(
1
,
tensor_out
);
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
,
batch_size
);
auto
time1
=
time
();
for
(
size_t
i
=
0
;
i
<
2
;
i
++
)
{
std
::
cout
<<
"Pass "
<<
i
<<
"predict"
;
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
,
batch_size
);
}
}
int
main
(
int
argc
,
char
**
argv
)
{
MODELDIR
=
"./LB_icnet_model"
;
//DATA = "./icnet_image.txt";
DATA
=
"./1.png.txt"
;
REFER
=
"./icnet_label.txt"
;
paddle
::
demo
::
Main
(
USE_GPU
);
auto
time2
=
time
();
std
::
ofstream
ofresult
(
"naive_test_result.txt"
,
std
::
ios
::
app
);
std
::
cout
<<
"batch: "
<<
batch_size
<<
" predict cost: "
<<
time_diff
(
time1
,
time2
)
/
100.0
<<
"ms"
<<
std
::
endl
;
std
::
cout
<<
outputs
.
size
()
<<
std
::
endl
;
}
}
// namespace paddle
system
(
"pause"
);
int
main
(
int
argc
,
char
**
argv
)
{
paddle
::
test_naive
(
1
<<
0
);
return
0
;
}
\ No newline at end of file
paddle/fluid/inference/api/demo_ci/naive_model_test.cc
已删除
100644 → 0
浏览文件 @
804dd7da
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <iostream>
#include <fstream>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace
paddle
{
std
::
string
DIRNAME
=
"./LB_icnet_model"
;
//std::string DIRNAME = "./infer_models";
NativeConfig
GetConfig
()
{
NativeConfig
config
;
config
.
prog_file
=
DIRNAME
+
"/__model__"
;
config
.
param_file
=
DIRNAME
+
"/__params__"
;
config
.
fraction_of_gpu_memory
=
0.8
;
config
.
use_gpu
=
true
;
config
.
device
=
0
;
return
config
;
}
using
Time
=
decltype
(
std
::
chrono
::
high_resolution_clock
::
now
());
Time
time
()
{
return
std
::
chrono
::
high_resolution_clock
::
now
();
};
double
time_diff
(
Time
t1
,
Time
t2
)
{
typedef
std
::
chrono
::
microseconds
ms
;
auto
diff
=
t2
-
t1
;
ms
counter
=
std
::
chrono
::
duration_cast
<
ms
>
(
diff
);
return
counter
.
count
()
/
1000.0
;
}
void
test_naive
(
int
batch_size
){
NativeConfig
config
=
GetConfig
();
// config.model_dir = model_path;
auto
predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
int
height
=
449
;
int
width
=
581
;
//int height = 3;
//int width = 3;
int
num_sum
=
height
*
width
*
3
*
batch_size
;
std
::
vector
<
float
>
data
;
for
(
int
i
=
0
;
i
<
num_sum
;
i
++
)
{
data
.
push_back
(
0.0
);
}
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
batch_size
,
3
,
height
,
width
});
tensor
.
data
.
Resize
(
sizeof
(
float
)
*
batch_size
*
3
*
height
*
width
);
std
::
copy
(
data
.
begin
(),
data
.
end
(),
static_cast
<
float
*>
(
tensor
.
data
.
data
()));
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
,
tensor
);
PaddleTensor
tensor_out
;
std
::
vector
<
PaddleTensor
>
outputs
(
1
,
tensor_out
);
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
,
batch_size
);
std
::
cout
<<
"start predict123:"
<<
std
::
endl
;
auto
time1
=
time
();
for
(
size_t
i
=
0
;
i
<
2
;
i
++
)
{
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
,
batch_size
);
std
::
cout
<<
"pass "
<<
i
;
}
auto
time2
=
time
();
std
::
ofstream
ofresult
(
"naive_test_result.txt"
,
std
::
ios
::
app
);
std
::
cout
<<
"batch: "
<<
batch_size
<<
" predict cost: "
<<
time_diff
(
time1
,
time2
)
/
100.0
<<
"ms"
<<
std
::
endl
;
std
::
cout
<<
outputs
.
size
()
<<
std
::
endl
;
/*
int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
ofresult << std::to_string(data_o[j]) << " ";
}
ofresult << std::endl;
ofresult.close();
*/
}
}
// namespace paddle
int
main
(
int
argc
,
char
**
argv
)
{
paddle
::
test_naive
(
1
<<
0
);
return
0
;
}
\ No newline at end of file
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
e41a3fcd
...
...
@@ -43,6 +43,7 @@ template <typename T>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
VLOG
(
3
)
<<
"inside cudnn"
;
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"It must use CUDAPlace."
);
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
...
...
@@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
filter_data
=
filter
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
VLOG
(
3
)
<<
"get all inputs"
;
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor
input_desc
;
ScopedTensorDescriptor
output_desc
;
...
...
@@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t
cudnn_conv_desc
=
conv_desc
.
descriptor
<
T
>
(
paddings
,
strides
,
dilations
);
VLOG
(
3
)
<<
"create tensor descriptor"
;
#if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
...
...
@@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
groups
));
groups
=
1
;
#endif
VLOG
(
3
)
<<
"before create tensor descriptor"
;
cudnnTensorDescriptor_t
cudnn_input_desc
=
input_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize2int
(
input
->
dims
()),
groups
);
cudnnTensorDescriptor_t
cudnn_output_desc
=
output_desc
.
descriptor
<
T
>
(
...
...
@@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_height
=
output
->
dims
()[
2
];
output_width
=
output
->
dims
()[
3
];
}
VLOG
(
3
)
<<
"after create tensor descriptor"
;
int
group_offset_in
=
input_channels
/
groups
*
input_height
*
input_width
*
input_depth
;
int
group_offset_out
=
...
...
@@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
VLOG
(
3
)
<<
"set cudnn algorithm"
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
...
...
@@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
VLOG
(
3
)
<<
"before get workspace"
;
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
...
...
@@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// the limit because the algo is overrided to use tensor core.
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
VLOG
(
3
)
<<
"after get workspace"
;
// Allocate on GPU memory
platform
::
CUDAPlace
gpu
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
workspace_size_in_bytes
=
1024
;
cudnn_workspace
=
paddle
::
memory
::
Alloc
(
gpu
,
workspace_size_in_bytes
);
VLOG
(
3
)
<<
"allocate memory"
;
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
...
...
@@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
}
VLOG
(
3
)
<<
"cudnn forward"
;
// Release the cudnn workspace
paddle
::
memory
::
Free
(
gpu
,
cudnn_workspace
);
VLOG
(
3
)
<<
"cudnn pass"
;
}
};
...
...
@@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Already on GPU
void
*
cudnn_workspace
=
nullptr
;
platform
::
CUDAPlace
gpu
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
workspace_size_in_bytes
=
1024
;
cudnn_workspace
=
paddle
::
memory
::
Alloc
(
gpu
,
workspace_size_in_bytes
);
// ------------------- cudnn conv backward data ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
...
...
paddle/fluid/operators/load_combine_op.cc
浏览文件 @
e41a3fcd
...
...
@@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase {
auto
filename
=
Attr
<
std
::
string
>
(
"file_path"
);
auto
load_as_fp16
=
Attr
<
bool
>
(
"load_as_fp16"
);
std
::
ifstream
fin
(
filename
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
std
::
ifstream
fin
(
filename
,
std
::
ios_base
::
in
|
std
::
ios_base
::
binary
);
PADDLE_ENFORCE
(
!
fin
.
bad
(
),
"Cannot open file %s for load_combine op"
,
filename
);
auto
out_var_names
=
Outputs
(
"Out"
);
...
...
@@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase {
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
for
(
size_t
i
=
0
;
i
<
out_var_names
.
size
();
i
++
)
{
VLOG
(
3
)
<<
"load "
<<
out_var_names
[
i
];
auto
*
out_var
=
scope
.
FindVar
(
out_var_names
[
i
]);
PADDLE_ENFORCE
(
out_var
!=
nullptr
,
"Output variable %s cannot be found"
,
out_var_names
[
i
]);
auto
*
tensor
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
();
VLOG
(
3
)
<<
"Get Tensor"
;
// Error checking
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot read more from file %s"
,
PADDLE_ENFORCE
(
!
fin
.
bad
(
),
"Cannot read more from file %s"
,
filename
);
VLOG
(
3
)
<<
"before deserialization"
;
// Get data from fin to tensor
DeserializeFromStream
(
fin
,
tensor
,
dev_ctx
);
VLOG
(
3
)
<<
"after deserialization"
;
auto
in_dtype
=
framework
::
ToDataType
(
tensor
->
type
());
auto
out_dtype
=
load_as_fp16
?
framework
::
proto
::
VarType
::
FP16
:
in_dtype
;
...
...
@@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase {
tensor
->
set_lod
(
fp16_tensor
.
lod
());
tensor
->
ShareDataWith
(
fp16_tensor
);
}
VLOG
(
3
)
<<
"load "
<<
out_var_names
[
i
]
<<
" finished"
;
}
}
};
...
...
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
e41a3fcd
...
...
@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
#if !defined(_WIN32)
#define CUDNN_ENFORCE(condition) \
do { \
cudnnStatus_t status = condition; \
...
...
@@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
} \
} while (false)
#else
#define CUDNN_ENFORCE(condition)
#endif
enum
class
DataLayout
{
// Not use
kNHWC
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录