Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
c13efe02
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
c13efe02
编写于
7月 31, 2018
作者:
N
nhzlx
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into add_tensorrt_elementwise_add
上级
b241a47e
baff71d5
变更
48
隐藏空白更改
内联
并排
Showing
48 changed file
with
1010 addition
and
365 deletion
+1010
-365
paddle/fluid/framework/details/exception_holder.h
paddle/fluid/framework/details/exception_holder.h
+83
-0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
...uid/framework/details/scope_buffered_ssa_graph_executor.h
+3
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+5
-22
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+3
-3
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+3
-0
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+7
-0
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+13
-17
paddle/fluid/inference/analysis/analyzer_main.cc
paddle/fluid/inference/analysis/analyzer_main.cc
+33
-0
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+6
-2
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+13
-0
paddle/fluid/inference/analysis/data_flow_graph.h
paddle/fluid/inference/analysis/data_flow_graph.h
+2
-0
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+2
-2
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
...nference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+5
-5
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
...fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+9
-3
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
...fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+30
-6
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
.../fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+1
-1
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
...nference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+2
-1
paddle/fluid/inference/analysis/helper.h
paddle/fluid/inference/analysis/helper.h
+15
-0
paddle/fluid/inference/analysis/model_store_pass.cc
paddle/fluid/inference/analysis/model_store_pass.cc
+61
-0
paddle/fluid/inference/analysis/model_store_pass.h
paddle/fluid/inference/analysis/model_store_pass.h
+51
-0
paddle/fluid/inference/analysis/model_store_pass_tester.cc
paddle/fluid/inference/analysis/model_store_pass_tester.cc
+43
-0
paddle/fluid/inference/analysis/pass.h
paddle/fluid/inference/analysis/pass.h
+1
-0
paddle/fluid/inference/analysis/pass_manager_tester.cc
paddle/fluid/inference/analysis/pass_manager_tester.cc
+5
-2
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+4
-4
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
...rence/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+3
-3
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
...fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+3
-4
paddle/fluid/inference/analysis/ut_helper.h
paddle/fluid/inference/analysis/ut_helper.h
+3
-18
paddle/fluid/inference/api/api_anakin_engine_tester.cc
paddle/fluid/inference/api/api_anakin_engine_tester.cc
+10
-8
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+11
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+12
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+4
-5
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+9
-8
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+5
-5
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+1
-1
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+25
-0
paddle/fluid/operators/math/im2col.cc
paddle/fluid/operators/math/im2col.cc
+10
-52
paddle/fluid/operators/math/im2col_cfo_cpu.h
paddle/fluid/operators/math/im2col_cfo_cpu.h
+252
-0
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+103
-72
paddle/fluid/operators/reshape_op.cc
paddle/fluid/operators/reshape_op.cc
+5
-26
paddle/fluid/platform/cuda_helper_test.cu
paddle/fluid/platform/cuda_helper_test.cu
+109
-74
paddle/fluid/platform/cuda_primitives.h
paddle/fluid/platform/cuda_primitives.h
+10
-10
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+1
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+4
-5
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
...id/tests/unittests/test_memory_optimization_transpiler.py
+24
-0
python/paddle/fluid/tests/unittests/test_reshape_op.py
python/paddle/fluid/tests/unittests/test_reshape_op.py
+3
-3
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+1
-0
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+1
-1
未找到文件。
paddle/fluid/framework/details/exception_holder.h
0 → 100644
浏览文件 @
c13efe02
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
ExceptionHolder
{
public:
void
Catch
(
const
platform
::
EnforceNotMet
&
exp
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
exception_
.
reset
(
new
platform
::
EnforceNotMet
(
exp
));
type_
=
kEnforceNotMet
;
}
void
Catch
(
const
platform
::
EOFException
&
exp
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
// EOFException will not cover up existing EnforceNotMet.
if
(
exception_
.
get
()
==
nullptr
)
{
exception_
.
reset
(
new
platform
::
EOFException
(
exp
));
type_
=
kEOF
;
}
}
bool
ExceptionCatched
()
const
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
return
exception_
.
get
()
!=
nullptr
;
}
void
Throw
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
switch
(
type_
)
{
case
kNone
:
break
;
case
kEnforceNotMet
:
{
auto
e
=
*
static_cast
<
platform
::
EnforceNotMet
*>
(
exception_
.
get
());
throw
e
;
break
;
}
case
kEOF
:
{
auto
e
=
*
static_cast
<
platform
::
EOFException
*>
(
exception_
.
get
());
throw
e
;
break
;
}
default:
LOG
(
FATAL
)
<<
"Unknown exception."
;
}
exception_
.
reset
();
type_
=
kNone
;
}
void
Clear
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
exception_
.
reset
();
type_
=
kNone
;
}
private:
enum
ExceptionType
{
kNone
,
kEnforceNotMet
,
kEOF
};
ExceptionType
type_
{
kNone
};
std
::
unique_ptr
<
std
::
exception
>
exception_
;
mutable
std
::
mutex
mu_
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
浏览文件 @
c13efe02
...
...
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
std
::
vector
<
VariableInfo
>
var_infos
,
std
::
vector
<
platform
::
Place
>
places
,
std
::
unique_ptr
<
SSAGraphExecutor
>&&
underlying_executor
);
const
ir
::
Graph
&
Graph
()
const
{
return
underlying_executor_
->
Graph
();
}
const
ir
::
Graph
&
Graph
()
const
override
{
return
underlying_executor_
->
Graph
();
}
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
)
override
;
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
c13efe02
...
...
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_
.
clear
();
exception_
.
reset
();
exception_
holder_
.
Clear
();
// Step 3. Execution
while
(
!
pending_vars
.
empty
())
{
...
...
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto
cur_ready_vars
=
ready_vars
.
PopAll
(
1
,
&
timeout
);
if
(
timeout
)
{
std
::
unique_lock
<
std
::
mutex
>
l
(
exception_mu_
);
if
(
exception_
)
{
l
.
unlock
();
if
(
exception_holder_
.
ExceptionCatched
())
{
for
(
auto
&
run_op_future
:
run_op_futures_
)
{
run_op_future
.
wait
();
}
l
.
lock
();
std
::
exception
*
exp
=
exception_
.
get
();
if
(
dynamic_cast
<
platform
::
EOFException
*>
(
exp
))
{
auto
e
=
*
static_cast
<
platform
::
EOFException
*>
(
exp
);
throw
e
;
}
else
if
(
dynamic_cast
<
platform
::
EnforceNotMet
*>
(
exp
))
{
auto
e
=
*
static_cast
<
platform
::
EnforceNotMet
*>
(
exp
);
throw
e
;
}
else
{
LOG
(
FATAL
)
<<
"Unknown exception."
;
}
exception_holder_
.
Throw
();
}
else
{
continue
;
}
...
...
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q
->
Extend
(
op
->
Outputs
());
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
}
catch
(
platform
::
EOFException
ex
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
exception_mu_
);
// EOFException will not cover up existing EnforceNotMet.
if
(
exception_
.
get
()
==
nullptr
)
{
exception_
.
reset
(
new
platform
::
EOFException
(
ex
));
}
exception_holder_
.
Catch
(
ex
);
}
catch
(
platform
::
EnforceNotMet
ex
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
exception_mu_
);
exception_
.
reset
(
new
platform
::
EnforceNotMet
(
ex
));
exception_holder_
.
Catch
(
ex
);
}
catch
(...)
{
LOG
(
FATAL
)
<<
"Unknown exception catched"
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
c13efe02
...
...
@@ -24,6 +24,7 @@
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
...
...
@@ -42,7 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
const
ir
::
Graph
&
Graph
()
const
{
return
*
graph_
;
}
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graph_
;
}
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
...
...
@@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
platform
::
DeviceContextPool
fetch_ctxs_
;
std
::
mutex
exception_mu_
;
std
::
unique_ptr
<
std
::
exception
>
exception_
;
ExceptionHolder
exception_holder_
;
std
::
atomic
<
int
>
running_ops_
;
void
InsertPendingOp
(
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
*
pending_ops
,
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
c13efe02
...
...
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
model_store_pass.cc
DEPS framework_proto proto_desc
)
cc_test
(
test_node SRCS node_tester.cc DEPS analysis
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
...
...
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test
(
test_pass_manager SRCS pass_manager_tester.cc
)
inference_analysis_test
(
test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
)
inference_analysis_test
(
test_model_store_pass SRCS model_store_pass_tester.cc
)
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
c13efe02
...
...
@@ -17,6 +17,7 @@
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
...
...
@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_string
(
inference_analysis_graphviz_log_root
,
"./"
,
"Graphviz debuger for data flow graphs."
);
DEFINE_string
(
inference_analysis_output_storage_path
,
""
,
"optimized model output path"
);
namespace
inference
{
namespace
analysis
{
...
...
@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
AddPass
(
"tensorrt-subgraph"
,
new
TensorRTSubGraphPass
(
trt_teller
));
}
AddPass
(
"data-flow-graph-to-fluid"
,
new
DataFlowGraphToFluidPass
);
if
(
!
FLAGS_inference_analysis_output_storage_path
.
empty
())
{
AddPass
(
"model-store-pass"
,
new
ModelStorePass
);
}
}
std
::
string
repr
()
const
override
{
return
"dfg-pass-manager"
;
}
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
c13efe02
...
...
@@ -16,28 +16,23 @@ limitations under the License. */
/*
* This file contains Analyzer, an class that exposed as a library that analyze
* and optimize
* Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
* control whether
* an process is applied on the program.
* and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
* multiple flags to
* control whether an process is applied on the program.
*
* The processes are called Passes in analysis, the Passes are placed in a
* pipeline, the first
* Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
* a data flow
* graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
* graph to a
* Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
* which take a
* node or data flow graph as input.
* pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
* Fluid ProgramDesc to
* a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
* a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
* pipeline can be any Passes
* which take a node or data flow graph as input.
*
* The Analyzer can be used in two methods, the first is a executable file which
* can be used to
* pre-process the inference model and can be controlled by passing difference
* command flags;
* can be used to pre-process the inference model and can be controlled by
* passing difference command flags;
* the other way is to compose inside the inference API as a runtime pre-process
* phase in the
* inference service.
* phase in the inference service.
*/
#include <gflags/gflags.h>
...
...
@@ -50,6 +45,7 @@ namespace paddle {
// flag if not available.
DECLARE_bool
(
inference_analysis_enable_tensorrt_subgraph_engine
);
DECLARE_string
(
inference_analysis_graphviz_log_root
);
DECLARE_string
(
inference_analysis_output_storage_path
);
namespace
inference
{
namespace
analysis
{
...
...
paddle/fluid/inference/analysis/analyzer_main.cc
0 → 100644
浏览文件 @
c13efe02
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file implements analysizer -- an executation help to analyze and
* optimize trained model.
*/
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
int
main
(
int
argc
,
char
**
argv
)
{
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
using
paddle
::
inference
::
analysis
::
Analyzer
;
using
paddle
::
inference
::
analysis
::
Argument
;
Argument
argument
;
Analyzer
analyzer
;
analyzer
.
Run
(
&
argument
);
return
0
;
}
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
c13efe02
...
...
@@ -20,14 +20,18 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Test
er
,
analysis_without_tensorrt
)
{
TEST
(
Analyz
er
,
analysis_without_tensorrt
)
{
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine
=
false
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
}
TEST
_F
(
DFG_Test
er
,
analysis_with_tensorrt
)
{
TEST
(
Analyz
er
,
analysis_with_tensorrt
)
{
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine
=
true
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
}
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
c13efe02
...
...
@@ -36,6 +36,16 @@ namespace analysis {
* All the fields should be registered here for clearness.
*/
struct
Argument
{
Argument
()
=
default
;
explicit
Argument
(
const
std
::
string
&
fluid_model_dir
)
:
fluid_model_dir
(
new
std
::
string
(
fluid_model_dir
))
{}
// The directory of the trained model.
std
::
unique_ptr
<
std
::
string
>
fluid_model_dir
;
// The path of `__model__` and `param`, this is used when the file name of
// model and param is changed.
std
::
unique_ptr
<
std
::
string
>
fluid_model_program_path
;
std
::
unique_ptr
<
std
::
string
>
fluid_model_param_path
;
// The graph that process by the Passes or PassManagers.
std
::
unique_ptr
<
DataFlowGraph
>
main_dfg
;
...
...
@@ -44,6 +54,9 @@ struct Argument {
// The processed program desc.
std
::
unique_ptr
<
framework
::
proto
::
ProgramDesc
>
transformed_program_desc
;
// The output storage path of ModelStorePass.
std
::
unique_ptr
<
std
::
string
>
model_output_store_path
;
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
...
...
paddle/fluid/inference/analysis/data_flow_graph.h
浏览文件 @
c13efe02
...
...
@@ -36,6 +36,8 @@ namespace analysis {
/*
* DataFlowGraph - A container of Value and Function Nodes.
*
* This is the base graph for any other type of graphs, such as SSA or CFG.
*/
struct
DataFlowGraph
{
NodeMap
nodes
;
...
...
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
浏览文件 @
c13efe02
...
...
@@ -20,7 +20,7 @@ namespace inference {
namespace
analysis
{
TEST
(
DataFlowGraph
,
BFS
)
{
auto
desc
=
LoadProgramDesc
();
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
dfg
.
Build
();
...
...
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
}
TEST
(
DataFlowGraph
,
DFS
)
{
auto
desc
=
LoadProgramDesc
();
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
dfg
.
Build
();
GraphTraits
<
DataFlowGraph
>
trait
(
&
dfg
);
...
...
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
浏览文件 @
c13efe02
...
...
@@ -26,21 +26,21 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
Test
)
{
DataFlowGraph
graph
;
TEST
(
DataFlowGraph
,
Test
)
{
Argument
argument
(
FLAGS_inference_model_dir
)
;
FluidToDataFlowGraphPass
pass0
;
DataFlowGraphToFluidPass
pass1
;
ASSERT_TRUE
(
pass0
.
Initialize
(
&
argument
));
ASSERT_TRUE
(
pass1
.
Initialize
(
&
argument
));
pass0
.
Run
(
&
graph
);
pass1
.
Run
(
&
graph
);
pass0
.
Run
(
argument
.
main_dfg
.
get
()
);
pass1
.
Run
(
argument
.
main_dfg
.
get
()
);
pass0
.
Finalize
();
pass1
.
Finalize
();
LOG
(
INFO
)
<<
graph
.
nodes
.
size
();
LOG
(
INFO
)
<<
argument
.
main_dfg
->
nodes
.
size
();
}
};
// namespace analysis
...
...
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
浏览文件 @
c13efe02
...
...
@@ -23,12 +23,18 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST_F
(
DFG_Tester
,
dfg_graphviz_draw_pass_tester
)
{
auto
dfg
=
ProgramDescToDFG
(
*
argument
.
origin_program_desc
);
TEST
(
DFG_GraphvizDrawPass
,
dfg_graphviz_draw_pass_tester
)
{
Argument
argument
(
FLAGS_inference_model_dir
);
FluidToDataFlowGraphPass
pass0
;
ASSERT_TRUE
(
pass0
.
Initialize
(
&
argument
));
pass0
.
Run
(
argument
.
main_dfg
.
get
());
// auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
DFG_GraphvizDrawPass
::
Config
config
(
"./"
,
"test"
);
DFG_GraphvizDrawPass
pass
(
config
);
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
&
dfg
);
pass
.
Run
(
argument
.
main_dfg
.
get
()
);
// test content
std
::
ifstream
file
(
"./0-graph_test.dot"
);
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
浏览文件 @
c13efe02
...
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <string>
#include <vector>
...
...
@@ -25,8 +26,20 @@ namespace analysis {
bool
FluidToDataFlowGraphPass
::
Initialize
(
Argument
*
argument
)
{
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
);
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
origin_program_desc
);
PADDLE_ENFORCE
(
argument
);
if
(
argument
->
origin_program_desc
)
{
LOG
(
WARNING
)
<<
"argument's origin_program_desc is already set, might "
"duplicate called"
;
}
if
(
!
argument
->
fluid_model_program_path
)
{
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
fluid_model_dir
);
argument
->
fluid_model_program_path
.
reset
(
new
std
::
string
(
*
argument
->
fluid_model_dir
+
"/__model__"
));
}
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
fluid_model_program_path
);
auto
program
=
LoadProgramDesc
(
*
argument
->
fluid_model_program_path
);
argument
->
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
program
));
if
(
!
argument
->
main_dfg
)
{
argument
->
main_dfg
.
reset
(
new
DataFlowGraph
);
}
...
...
@@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE
(
graph
);
PADDLE_ENFORCE
(
desc_
);
// insert vars
// The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
// will keep updating to its latest alias during the graph-building.
std
::
unordered_map
<
std
::
string
,
size_t
>
var2id
;
auto
&
main_block
=
desc_
->
blocks
(
framework
::
kRootBlockIndex
);
for
(
int
i
=
0
;
i
<
main_block
.
vars_size
();
i
++
)
{
...
...
@@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
var2id
[
var
.
name
()]
=
v
->
id
();
}
// The variables in a SSA can only write once, so if a variable is written
// multiple times(quite common in our ProgramDesc design), multiple alias
// Nodes of this variable will be created, and each will just write once.
// An set that keep all the names of the variables(the original, not alias)
// that have been written(as outputs). Once an Op's output variable hit the
// set, it should create a new alias and update the global alias for this
// variable. And that make a Data Flow Graph a SSA.
std
::
unordered_set
<
Node
*>
unique_written_vars
;
for
(
int
i
=
0
;
i
<
main_block
.
ops_size
();
i
++
)
{
const
auto
&
op
=
main_block
.
ops
(
i
);
auto
*
o
=
graph
->
nodes
.
Create
(
Node
::
Type
::
kFunction
);
...
...
@@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
o
->
SetPbMsg
(
op
.
SerializeAsString
());
// set inputs and outputs
std
::
unordered_set
<
Node
*>
inlinks
;
for
(
int
j
=
0
;
j
<
op
.
inputs_size
();
j
++
)
{
auto
&
in_var
=
op
.
inputs
(
j
);
for
(
int
k
=
0
;
k
<
in_var
.
arguments_size
();
k
++
)
{
auto
*
in
=
graph
->
nodes
.
GetMutable
(
var2id
.
at
(
in_var
.
arguments
(
k
)));
in
->
outlinks
.
push_back
(
o
);
o
->
inlinks
.
push_back
(
in
);
inlinks
.
insert
(
in
);
}
}
for
(
int
j
=
0
;
j
<
op
.
outputs_size
();
j
++
)
{
auto
&
out_var
=
op
.
outputs
(
j
);
for
(
int
k
=
0
;
k
<
out_var
.
arguments_size
();
k
++
)
{
auto
*
out
=
graph
->
nodes
.
GetMutable
(
var2id
[
out_var
.
arguments
(
k
)]);
if
(
inlink
s
.
count
(
out
))
{
if
(
unique_written_var
s
.
count
(
out
))
{
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto
*
out_alias
=
graph
->
nodes
.
Create
(
Node
::
Type
::
kValue
);
out_alias
->
SetName
(
out
->
name
());
out_alias
->
SetPbDesc
(
out
->
pb_desc
());
out_alias
->
SetPbMsg
(
out
->
pb_msg
());
var2id
[
out_alias
->
name
()]
=
out_alias
->
id
();
// update a -> a0
var2id
[
out_alias
->
name
()]
=
out_alias
->
id
();
// update variable's alias Node
LOG
(
INFO
)
<<
"loop found in graph, create SSA alias node ["
<<
out_alias
->
repr
()
<<
"] for ["
<<
out
->
repr
()
<<
"]"
;
out
=
out_alias
;
}
out
->
inlinks
.
push_back
(
o
);
o
->
outlinks
.
push_back
(
out
);
unique_written_vars
.
insert
(
out
);
}
}
}
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
浏览文件 @
c13efe02
...
...
@@ -30,7 +30,7 @@ namespace inference {
namespace
analysis
{
/*
* Transform a FluidDesc to a
data flow graph
.
* Transform a FluidDesc to a
SSA
.
*/
class
FluidToDataFlowGraphPass
final
:
public
DataFlowGraphPass
{
public:
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
浏览文件 @
c13efe02
...
...
@@ -21,8 +21,9 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
Ini
t
)
{
TEST
(
FluidToDataFlowGraphPass
,
Tes
t
)
{
FluidToDataFlowGraphPass
pass
;
Argument
argument
(
FLAGS_inference_model_dir
);
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
// Analysis is sensitive to ProgramDesc, careful to change the original model.
...
...
paddle/fluid/inference/analysis/helper.h
浏览文件 @
c13efe02
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdio>
#include <fstream>
#include <string>
#include <typeindex>
#include <unordered_map>
...
...
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
}
}
static
framework
::
proto
::
ProgramDesc
LoadProgramDesc
(
const
std
::
string
&
model_path
)
{
std
::
ifstream
fin
(
model_path
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
fin
.
is_open
(),
"Cannot open file %s"
,
model_path
);
fin
.
seekg
(
0
,
std
::
ios
::
end
);
std
::
string
buffer
(
fin
.
tellg
(),
' '
);
fin
.
seekg
(
0
,
std
::
ios
::
beg
);
fin
.
read
(
&
buffer
[
0
],
buffer
.
size
());
fin
.
close
();
framework
::
proto
::
ProgramDesc
program_desc
;
program_desc
.
ParseFromString
(
buffer
);
return
program_desc
;
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
...
...
paddle/fluid/inference/analysis/model_store_pass.cc
0 → 100644
浏览文件 @
c13efe02
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <stdio.h>
#include <stdlib.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
ModelStorePass
::
Run
(
DataFlowGraph
*
x
)
{
if
(
!
argument_
->
fluid_model_param_path
)
{
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
fluid_model_dir
);
argument_
->
fluid_model_param_path
.
reset
(
new
std
::
string
(
*
argument_
->
fluid_model_dir
+
"param"
));
}
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
model_output_store_path
);
// Directly copy param file to destination.
std
::
stringstream
ss
;
// NOTE these commands only works on linux.
ss
<<
"mkdir -p "
<<
*
argument_
->
model_output_store_path
;
LOG
(
INFO
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
ss
.
str
(
""
);
ss
<<
"cp "
<<
*
argument_
->
fluid_model_dir
<<
"/*"
<<
" "
<<
*
argument_
->
model_output_store_path
;
LOG
(
INFO
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
// Store program
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
transformed_program_desc
,
"program desc is not transformed, should call "
"DataFlowGraphToFluidPass first."
);
const
std
::
string
program_output_path
=
*
argument_
->
model_output_store_path
+
"/__model__"
;
std
::
ofstream
file
(
program_output_path
,
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
file
.
is_open
(),
"failed to open %s to write."
,
program_output_path
);
const
std
::
string
serialized_message
=
argument_
->
transformed_program_desc
->
SerializeAsString
();
file
.
write
(
serialized_message
.
c_str
(),
serialized_message
.
size
());
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/model_store_pass.h
0 → 100644
浏览文件 @
c13efe02
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines ModelStorePass, which store the runtime DFG to a Paddle
* model in the disk, and that model can be reloaded for prediction.
*/
#include "paddle/fluid/inference/analysis/pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
class
ModelStorePass
:
public
DataFlowGraphPass
{
public:
bool
Initialize
(
Argument
*
argument
)
override
{
if
(
!
argument
)
{
LOG
(
ERROR
)
<<
"invalid argument"
;
return
false
;
}
argument_
=
argument
;
return
true
;
}
void
Run
(
DataFlowGraph
*
x
)
override
;
std
::
string
repr
()
const
override
{
return
"DFG-store-pass"
;
}
std
::
string
description
()
const
override
{
return
R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
model in the disk, and that model can be reloaded for prediction again.)DD"
;
}
private:
Argument
*
argument_
{
nullptr
};
};
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/model_store_pass_tester.cc
0 → 100644
浏览文件 @
c13efe02
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
DEFINE_string
(
inference_model_dir
,
""
,
"Model path"
);
TEST
(
DFG_StorePass
,
test
)
{
Analyzer
analyzer
;
Argument
argument
(
FLAGS_inference_model_dir
);
argument
.
model_output_store_path
.
reset
(
new
std
::
string
(
"./_dfg_store_pass_tmp"
));
// disable storage in alalyzer
FLAGS_inference_analysis_output_storage_path
=
""
;
analyzer
.
Run
(
&
argument
);
ModelStorePass
pass
;
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/pass.h
浏览文件 @
c13efe02
...
...
@@ -50,6 +50,7 @@ class Pass {
// Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual
Pass
*
CreateGraphvizDebugerPass
()
const
{
return
nullptr
;
}
virtual
void
Run
()
{
LOG
(
FATAL
)
<<
"not valid"
;
}
// Run on a single Node.
virtual
void
Run
(
Node
*
x
)
{
LOG
(
FATAL
)
<<
"not valid"
;
}
// Run on a single Function.
...
...
paddle/fluid/inference/analysis/pass_manager_tester.cc
浏览文件 @
c13efe02
...
...
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
std
::
string
description
()
const
override
{
return
"some doc"
;
}
};
TEST
_F
(
DFG_Test
er
,
DFG_pass_manager
)
{
TEST
(
PassManag
er
,
DFG_pass_manager
)
{
TestDfgPassManager
manager
;
DFG_GraphvizDrawPass
::
Config
config
(
"./"
,
"dfg.dot"
);
...
...
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager
.
Register
(
"graphviz"
,
new
DFG_GraphvizDrawPass
(
config
));
manager
.
Register
(
"dfg-to-fluid"
,
new
DataFlowGraphToFluidPass
);
Argument
argument
(
FLAGS_inference_model_dir
);
ASSERT_TRUE
(
&
argument
);
ASSERT_TRUE
(
manager
.
Initialize
(
&
argument
));
manager
.
RunAll
();
}
TEST_F
(
DFG_Tester
,
Node_pass_manager
)
{
TEST
(
PassManager
,
Node_pass_manager
)
{
Argument
argument
(
FLAGS_inference_model_dir
);
// Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass
pass0
;
pass0
.
Initialize
(
&
argument
);
...
...
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
浏览文件 @
c13efe02
...
...
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
return
false
;
};
TEST
_F
(
DFG_Tes
ter
,
Split
)
{
auto
desc
=
LoadProgramDesc
();
TEST
(
SubGraphSplit
ter
,
Split
)
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
LOG
(
INFO
)
<<
"spliter
\n
"
<<
dfg
.
DotString
();
...
...
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
ASSERT_EQ
(
subgraphs
.
back
().
size
(),
6UL
);
}
TEST
_F
(
DFG_Tes
ter
,
Fuse
)
{
auto
desc
=
LoadProgramDesc
();
TEST
(
SubGraphSplit
ter
,
Fuse
)
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
size_t
count0
=
dfg
.
nodes
.
size
();
...
...
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
浏览文件 @
c13efe02
...
...
@@ -22,11 +22,11 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
tensorrt_subgraph_node_mark_pass
)
{
TEST
(
TensorRTSubgraphNodeMarkPass
,
test
)
{
// init
FluidToDataFlowGraphPass
pass
;
Argument
argument
(
FLAGS_inference_model_dir
);
ASSERT_TRUE
(
pass
.
Initialize
(
&
argument
));
argument
.
main_dfg
.
reset
(
new
DataFlowGraph
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
TensorRTSubgraphNodeMarkPass
::
teller_t
teller
=
[](
const
Node
*
node
)
{
...
...
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
for
(
auto
&
node
:
argument
.
main_dfg
->
nodes
.
nodes
())
{
counter
+=
node
->
attr
(
ATTR_supported_by_tensorrt
).
Bool
();
}
ASSERT_EQ
(
counter
,
2
);
LOG
(
INFO
)
<<
counter
<<
" nodes marked"
;
}
...
...
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
浏览文件 @
c13efe02
...
...
@@ -25,7 +25,7 @@ namespace analysis {
DEFINE_string
(
dot_dir
,
"./"
,
""
);
TEST
_F
(
DFG_Tester
,
tensorrt_single_pass
)
{
TEST
(
TensorRTSubGraphPass
,
main
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"elementwise_add"
,
"mul"
,
"sigmoid"
});
SubGraphSplitter
::
NodeInsideSubgraphTeller
teller
=
[
&
](
const
Node
*
node
)
{
...
...
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
return
false
;
};
LOG
(
INFO
)
<<
"init"
;
Argument
argument
(
FLAGS_inference_model_dir
);
DFG_GraphvizDrawPass
::
Config
config
{
FLAGS_dot_dir
,
"origin"
};
DFG_GraphvizDrawPass
::
Config
config1
{
FLAGS_dot_dir
,
"fusion"
};
...
...
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
FluidToDataFlowGraphPass
pass0
;
TensorRTSubGraphPass
trt_pass
(
std
::
move
(
teller
));
LOG
(
INFO
)
<<
"Initialize"
;
dfg_pass
.
Initialize
(
&
argument
);
dfg_pass1
.
Initialize
(
&
argument
);
pass0
.
Initialize
(
&
argument
);
trt_pass
.
Initialize
(
&
argument
);
LOG
(
INFO
)
<<
"Run"
;
argument
.
main_dfg
.
reset
(
new
DataFlowGraph
);
pass0
.
Run
(
argument
.
main_dfg
.
get
());
dfg_pass
.
Run
(
argument
.
main_dfg
.
get
());
...
...
paddle/fluid/inference/analysis/ut_helper.h
浏览文件 @
c13efe02
...
...
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/
ut_
helper.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -32,27 +32,12 @@ namespace analysis {
DEFINE_string
(
inference_model_dir
,
""
,
"inference test model dir"
);
static
framework
::
proto
::
ProgramDesc
LoadProgramDesc
(
const
std
::
string
&
model_dir
=
FLAGS_inference_model_dir
)
{
std
::
string
msg
;
std
::
string
net_file
=
FLAGS_inference_model_dir
+
"/__model__"
;
std
::
ifstream
fin
(
net_file
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
net_file
);
fin
.
seekg
(
0
,
std
::
ios
::
end
);
msg
.
resize
(
fin
.
tellg
());
fin
.
seekg
(
0
,
std
::
ios
::
beg
);
fin
.
read
(
&
(
msg
.
at
(
0
)),
msg
.
size
());
fin
.
close
();
framework
::
proto
::
ProgramDesc
program_desc
;
program_desc
.
ParseFromString
(
msg
);
return
program_desc
;
}
static
DataFlowGraph
ProgramDescToDFG
(
const
framework
::
proto
::
ProgramDesc
&
desc
)
{
DataFlowGraph
graph
;
FluidToDataFlowGraphPass
pass
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
argument
.
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
desc
));
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
&
graph
);
...
...
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
class
DFG_Tester
:
public
::
testing
::
Test
{
protected:
void
SetUp
()
override
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
);
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
argument
.
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
desc
));
}
...
...
paddle/fluid/inference/api/api_anakin_engine_tester.cc
浏览文件 @
c13efe02
...
...
@@ -37,19 +37,21 @@ TEST(inference, anakin) {
float
data
[
1
*
3
*
224
*
224
]
=
{
1.0
f
};
PaddleTensor
tensor
{.
name
=
"input_0"
,
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
224
,
224
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
FLOAT32
};
PaddleTensor
tensor
;
tensor
.
name
=
"input_0"
;
tensor
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
224
,
224
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
;
paddle_tensor_feeds
.
emplace_back
(
std
::
move
(
tensor
));
PaddleTensor
tensor_out
{.
name
=
"prob_out"
,
.
shape
=
std
::
vector
<
int
>
({
1000
,
1
}),
.
data
=
PaddleBuf
(),
.
dtype
=
PaddleDType
::
FLOAT32
};
PaddleTensor
tensor_out
;
tensor_out
.
name
=
"prob_out"
;
tensor_out
.
shape
=
std
::
vector
<
int
>
({
1000
,
1
});
tensor_out
.
data
=
PaddleBuf
();
tensor_out
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
vector
<
PaddleTensor
>
outputs
;
outputs
.
emplace_back
(
std
::
move
(
tensor_out
));
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
c13efe02
...
...
@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
lod
.
emplace_back
(
level
);
}
input
.
set_lod
(
lod
);
feeds
->
push_back
(
input
);
}
return
true
;
...
...
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
buffer
.
Resize
(
sizeof
(
float
)
*
data
.
size
());
}
std
::
memcpy
(
buffer
.
data
(),
data
.
data
(),
buffer
.
length
());
// copy LoD
for
(
const
auto
&
level
:
fetchs
[
i
].
lod
())
{
outputs
->
at
(
i
).
lod
.
emplace_back
(
level
);
}
outputs
->
at
(
i
).
dtype
=
PaddleDType
::
FLOAT32
;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
c13efe02
...
...
@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void
OptimizeInferenceProgram
()
{
// Analyze inference_program
Argument
argument
;
if
(
!
config_
.
model_dir
.
empty
())
{
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
config_
.
model_dir
));
}
else
{
PADDLE_ENFORCE
(
!
config_
.
param_file
.
empty
(),
"Either model_dir or (param_file, prog_file) should be set."
);
PADDLE_ENFORCE
(
!
config_
.
prog_file
.
empty
());
argument
.
fluid_model_program_path
.
reset
(
new
std
::
string
(
config_
.
prog_file
));
argument
.
fluid_model_param_path
.
reset
(
new
std
::
string
(
config_
.
param_file
));
}
argument
.
origin_program_desc
.
reset
(
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
Singleton
<
Analyzer
>::
Global
().
Run
(
&
argument
);
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
c13efe02
...
...
@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
std
::
vector
<
int64_t
>
data
(
20
);
for
(
int
i
=
0
;
i
<
20
;
i
++
)
data
[
i
]
=
i
;
PaddleTensor
tensor
{
.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
10
,
1
}),
.
data
=
PaddleBuf
(
data
.
data
(),
data
.
size
()
*
sizeof
(
int64_t
)),
.
dtype
=
PaddleDType
::
INT64
};
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
10
,
1
});
tensor
.
data
=
PaddleBuf
(
data
.
data
(),
data
.
size
()
*
sizeof
(
int64_t
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
slots
(
4
,
tensor
);
...
...
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
浏览文件 @
c13efe02
...
...
@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
//# 2. Prepare input.
int64_t
data
[
4
]
=
{
1
,
2
,
3
,
4
};
PaddleTensor
tensor
{.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
4
,
1
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
INT64
}
;
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
4
,
1
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
slots
(
4
,
tensor
);
...
...
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
for
(
int
batch_id
=
0
;
batch_id
<
num_batches
;
++
batch_id
)
{
// 2. Dummy Input Data
int64_t
data
[
4
]
=
{
1
,
2
,
3
,
4
};
PaddleTensor
tensor
{.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
4
,
1
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
INT64
};
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
4
,
1
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
std
::
vector
<
PaddleTensor
>
inputs
(
4
,
tensor
);
std
::
vector
<
PaddleTensor
>
outputs
;
// 3. Run
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
c13efe02
...
...
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
file
.
close
();
// Inference.
PaddleTensor
input
{
.
name
=
"xx"
,
.
shape
=
record
.
shape
,
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
)),
.
dtype
=
PaddleDType
::
FLOAT32
}
;
PaddleTensor
input
;
input
.
shape
=
record
.
shape
;
input
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
c13efe02
...
...
@@ -67,9 +67,9 @@ struct PaddleTensor {
PaddleTensor
()
=
default
;
std
::
string
name
;
// variable name.
std
::
vector
<
int
>
shape
;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf
data
;
// blob of data.
PaddleDType
dtype
;
std
::
vector
<
std
::
vector
<
uint64_t
>>
lod
;
// lod data
};
enum
class
PaddleEngineKind
{
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
c13efe02
...
...
@@ -19,12 +19,17 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32
(
listen_and_serv_profile_period
,
0
,
"the period of listen_and_serv to do profile"
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop(
std
::
shared_ptr
<
framework
::
ExecutorPrepareContext
>
(
nullptr
));
rpc_service_
->
ResetBarrierCounter
();
int32_t
profile_step
=
0
;
while
(
true
)
{
PADDLE_ENFORCE_LE
(
profile_step
,
FLAGS_listen_and_serv_profile_period
,
"profile_step should not be larger then "
"FLAGS_listen_and_serv_profile_period"
);
if
(
FLAGS_listen_and_serv_profile_period
>
0
)
{
if
(
profile_step
==
0
)
{
auto
pf_state
=
paddle
::
platform
::
ProfilerState
::
kCPU
;
paddle
::
platform
::
EnableProfiler
(
pf_state
);
}
}
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_
->
SetCond
(
distributed
::
kRequestSend
);
...
...
@@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop(
// reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast
<
distributed
::
RequestSendHandler
*>
(
request_send_handler_
.
get
())
->
ResetSparseVarRecorder
();
if
(
FLAGS_listen_and_serv_profile_period
>
0
)
{
if
(
profile_step
==
FLAGS_listen_and_serv_profile_period
)
{
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kTotal
,
"/dev/null"
);
profile_step
=
0
;
}
else
{
profile_step
++
;
}
}
}
// while(true)
}
...
...
paddle/fluid/operators/math/im2col.cc
浏览文件 @
c13efe02
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -35,61 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
PADDLE_ENFORCE
(
im
.
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
->
dims
().
size
()
==
5
);
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
// TODO(TJ): change me to template
// further optimaze:
// 1. padding != 1
// 2. could also support stride_h != 1
if
(
stride
[
0
]
==
1
&&
stride
[
1
]
==
1
&&
dilation
[
0
]
==
1
&&
dilation
[
1
]
==
1
&&
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
int
col_matrix_width
=
output_width
*
output_height
;
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
oh
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_height
*
im_width
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
kw
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
return
;
}
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
static_cast
<
T
>
(
0
)
:
im_data
[
im_idx
];
}
dilation
[
1
]
==
1
)
{
if
(
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
im2col_sh1sw1dh1dw1ph0pw0
<
T
>
(
im
,
col
);
return
;
}
else
if
(
padding
[
0
]
==
1
&&
padding
[
1
]
==
1
)
{
im2col_sh1sw1dh1dw1ph1pw1
<
T
>
(
im
,
col
);
return
;
}
// TODO(TJ): complete padding >=2
}
im2col_common
<
T
>
(
im
,
dilation
,
stride
,
padding
,
col
);
}
};
...
...
paddle/fluid/operators/math/im2col_cfo_cpu.h
0 → 100644
浏览文件 @
c13efe02
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
/**
* The most common im2col algorithm.
* Support dilation, stride and padding.
*/
template
<
typename
T
>
inline
void
im2col_common
(
const
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
static_cast
<
T
>
(
0
)
:
im_data
[
im_idx
];
}
}
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1ph0pw0
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
int
col_matrix_width
=
output_width
*
output_height
;
int
im_size
=
im_height
*
im_width
;
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
const
T
*
im_data_oh
=
im_data
;
T
*
dst_data_oh
=
col_data
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
src_data_ic
=
im_data_oh
;
T
*
dst_data
=
dst_data_oh
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
src_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
kw
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
src_data_ic
=
src_data_ic
+
im_size
;
}
im_data_oh
=
im_data_oh
+
im_width
;
dst_data_oh
=
dst_data_oh
+
output_width
;
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 1
* and filter_width == 1 have a special implementation
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1ph1pw1
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
constexpr
int
plh
=
1
;
constexpr
int
prh
=
1
;
constexpr
int
plw
=
1
;
constexpr
int
prw
=
1
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
int
im_size
=
im_height
*
im_width
;
int
col_matrix_width
=
output_width
*
output_height
;
int
col_block_fh
=
filter_width
*
col_matrix_width
;
// fw*oh*ow
int
col_block_ic
=
filter_height
*
col_block_fh
;
// fh*fw*oh*ow
// fill height padding
{
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
T
*
col_start_l
=
col_data
;
T
*
col_start_r
=
col_data
+
(
filter_height
-
1
)
*
col_block_fh
+
col_matrix_width
-
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_l
=
col_start_l
;
T
*
dst_data_r
=
col_start_r
;
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memset
(
dst_data_l
,
0
,
copy_size
);
std
::
memset
(
dst_data_r
,
0
,
copy_size
);
dst_data_l
=
dst_data_l
+
col_matrix_width
;
dst_data_r
=
dst_data_r
+
col_matrix_width
;
}
col_start_l
=
col_start_l
+
col_block_ic
;
col_start_r
=
col_start_r
+
col_block_ic
;
}
}
auto
pad
=
static_cast
<
T
>
(
0
);
if
(
filter_width
==
1
)
{
// fill width padding
T
*
dst_data_ic
=
col_data
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_kh
=
dst_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
T
*
dst_data
=
dst_data_kh
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
-
1
;
*
dst_data
=
pad
;
++
dst_data
;
}
dst_data_kh
=
dst_data_kh
+
col_block_fh
;
}
dst_data_ic
=
dst_data_ic
+
col_block_ic
;
}
// fill core
size_t
copy_size
=
sizeof
(
T
)
*
(
output_width
-
plw
-
prw
);
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_size
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
col_matrix_width
;
continue
;
}
std
::
memcpy
(
dst_data
+
plw
,
src_data
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
src_data
=
src_data
+
im_width
;
}
}
}
return
;
}
// filter_width != 1
// fill width padding
T
*
dst_data_ic
=
col_data
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_kh
=
dst_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
T
*
dst_data
:
{
dst_data_kh
,
dst_data_kh
+
(
filter_width
-
prw
)
*
col_matrix_width
+
output_width
-
1
})
{
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
;
}
}
dst_data_kh
=
dst_data_kh
+
col_block_fh
;
}
dst_data_ic
=
dst_data_ic
+
col_block_ic
;
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// length of copy_size is equal kw.
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_size
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
filter_width
*
col_matrix_width
;
continue
;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for
(
int
kw
=
0
;
kw
<
plw
;
++
kw
)
{
std
::
memcpy
(
dst_data
+
(
plw
-
kw
),
src_data
,
sizeof
(
T
)
*
(
output_width
-
(
plw
-
kw
)));
dst_data
=
dst_data
+
col_matrix_width
;
}
for
(
int
kw
=
plw
;
kw
<
filter_width
-
prw
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
output_width
);
dst_data
=
dst_data
+
col_matrix_width
;
}
int
i
=
1
;
for
(
int
kw
=
filter_width
-
prw
;
kw
<
filter_width
;
++
kw
,
++
i
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
(
output_width
-
i
));
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
c13efe02
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include <sys/time.h>
#include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
template
<
typename
DeviceContext
,
typename
Place
>
void
testIm2col
()
{
...
...
@@ -160,82 +162,111 @@ void testIm2col() {
delete
context
;
}
void
testIm2colCPU
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
paddle
::
framework
::
Tensor
input
;
paddle
::
framework
::
Tensor
output
;
paddle
::
framework
::
Tensor
ref_output
;
std
::
vector
<
int
>
padding
({
ph
,
pw
});
std
::
vector
<
int
>
stride
({
1
,
1
});
// stride_y, stride_x
std
::
vector
<
int
>
dilation
({
1
,
1
});
// dilation_y, dilation_x
int
output_height
=
(
ih
-
fh
+
padding
[
0
]
*
2
)
/
stride
[
0
]
+
1
;
int
output_width
=
(
iw
-
fw
+
padding
[
1
]
*
2
)
/
stride
[
1
]
+
1
;
float
*
input_ptr
=
input
.
mutable_data
<
float
>
({
ic
,
ih
,
iw
},
paddle
::
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
input
.
numel
();
++
i
)
{
input_ptr
[
i
]
=
static_cast
<
float
>
(
i
+
1
);
}
paddle
::
platform
::
CPUPlace
place
;
paddle
::
platform
::
CPUDeviceContext
context
(
place
);
output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
ref_output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
paddle
::
operators
::
math
::
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
paddle
::
platform
::
CPUDeviceContext
,
float
>
im2col
;
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
output
);
auto
ref_im2col
=
[
&
](
const
paddle
::
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
paddle
::
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
float
*
im_data
=
im
.
data
<
float
>
();
float
*
col_data
=
col
->
data
<
float
>
();
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
0.
f
:
im_data
[
im_idx
];
}
}
}
};
ref_im2col
(
input
,
dilation
,
stride
,
padding
,
&
ref_output
);
float
*
out_cfo_ptr
=
output
.
data
<
float
>
();
float
*
out_ref_ptr
=
ref_output
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
.
numel
();
++
i
)
{
EXPECT_EQ
(
out_cfo_ptr
[
i
],
out_ref_ptr
[
i
]);
}
}
TEST
(
math
,
im2col
)
{
testIm2col
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
5
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
2
,
/*ph*/
0
,
/*pw*/
0
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
3
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
1
);
#ifdef PADDLE_WITH_CUDA
testIm2col
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
();
#endif
}
#define PREPARE_IM2COL_CPU \
paddle::platform::CPUPlace place; \
paddle::platform::CPUDeviceContext context(place); \
paddle::framework::Tensor input; \
paddle::framework::Tensor out; \
paddle::framework::Tensor ref; \
std::vector<int> padding({ph, pw}); \
std::vector<int> stride({1, 1}); \
std::vector<int> dilation({1, 1}); \
float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place); \
for (int i = 0; i < input.numel(); ++i) { \
input_ptr[i] = static_cast<float>(i + 1); \
} \
int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \
int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \
out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
paddle::operators::math::Im2ColFunctor< \
paddle::operators::math::ColFormat::kCFO, \
paddle::platform::CPUDeviceContext, float> \
im2col
void
testIm2colCPU
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
PREPARE_IM2COL_CPU
;
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
out
);
paddle
::
operators
::
math
::
im2col_common
<
float
>
(
input
,
dilation
,
stride
,
padding
,
&
ref
);
float
*
ref_data
=
ref
.
data
<
float
>
();
float
*
out_data
=
out
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
out
.
numel
();
++
i
)
{
EXPECT_EQ
(
out_data
[
i
],
ref_data
[
i
]);
}
}
void
benchIm2col
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
PREPARE_IM2COL_CPU
;
constexpr
int
repeat
=
100
;
auto
GetCurrentMs
=
[]()
->
double
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+3
*
time
.
tv_sec
+
1e-3
*
time
.
tv_usec
;
};
auto
t1
=
GetCurrentMs
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
out
);
}
auto
t2
=
GetCurrentMs
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
paddle
::
operators
::
math
::
im2col_common
<
float
>
(
input
,
dilation
,
stride
,
padding
,
&
ref
);
}
auto
t3
=
GetCurrentMs
();
LOG
(
INFO
)
<<
"before: "
<<
(
t3
-
t2
)
/
repeat
<<
",after: "
<<
(
t2
-
t1
)
/
repeat
<<
",boost: "
<<
((
t3
-
t2
)
/
(
t2
-
t1
)
-
1
)
*
100
<<
"%"
;
}
TEST
(
math
,
im2col_cputest
)
{
// padding_h == padding_w
for
(
int
p
=
0
;
p
<
4
;
++
p
)
{
// width == height
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
5
,
/*fh*/
4
,
/*fw*/
4
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
3
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
2
,
/*ph*/
p
,
/*pw*/
p
);
// height != width
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
// filter == 1
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
3
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
}
// padding_h != padding_w
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
2
);
// benchmark
for
(
int
p
:
{
0
,
1
})
{
for
(
int
k
:
{
1
,
3
,
5
})
{
LOG
(
INFO
)
<<
"padding == "
<<
p
<<
", filter == "
<<
k
;
benchIm2col
(
/*ic*/
3
,
/*ih*/
224
,
/*iw*/
224
,
/*fh*/
k
,
/*fw*/
k
,
/*ph*/
p
,
/*pw*/
p
);
}
}
}
paddle/fluid/operators/reshape_op.cc
浏览文件 @
c13efe02
...
...
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"(Tensor). The output tensor of reshape operator."
);
AddAttr
<
std
::
vector
<
int
>>
(
"shape"
,
"(std::vector<int>) Target shape of reshape operator."
);
AddAttr
<
bool
>
(
"inplace"
,
"(default: false) Change the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x)."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Reshape Operator.
...
...
@@ -233,16 +227,9 @@ class ReshapeKernel {
"sequence_reshape op."
);
}
bool
inplace
=
ctx
.
Attr
<
bool
>
(
"inplace"
);
out
->
mutable_data
(
ctx
.
GetPlace
(),
in
->
type
());
framework
::
TensorCopySync
(
*
in
,
ctx
.
GetPlace
(),
out
);
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
(
ctx
.
GetPlace
(),
in
->
type
());
framework
::
TensorCopySync
(
*
in
,
ctx
.
GetPlace
(),
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
in
);
out
->
Resize
(
out_dims
);
}
}
};
...
...
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
void
operator
()(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
d_out
=
ctx
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_x
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
in_dims
=
d_x
->
dims
();
d_x
->
mutable_data
(
ctx
.
GetPlace
(),
d_out
->
type
());
bool
inplace
=
ctx
.
Attr
<
bool
>
(
"inplace"
);
auto
in_dims
=
d_x
->
dims
();
if
(
!
inplace
)
{
framework
::
TensorCopy
(
*
d_out
,
ctx
.
GetPlace
(),
ctx
.
device_context
(),
d_x
);
ctx
.
device_context
().
Wait
();
d_x
->
Resize
(
in_dims
);
}
else
{
d_x
->
ShareDataWith
(
*
d_out
);
d_x
->
Resize
(
in_dims
);
}
framework
::
TensorCopySync
(
*
d_out
,
ctx
.
GetPlace
(),
d_x
);
d_x
->
Resize
(
in_dims
);
}
};
...
...
paddle/fluid/platform/cuda_helper_test.cu
浏览文件 @
c13efe02
...
...
@@ -13,7 +13,6 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include <random>
...
...
@@ -25,13 +24,13 @@
using
paddle
::
platform
::
PADDLE_CUDA_NUM_THREADS
;
using
paddle
::
platform
::
float16
;
#define CUDA_ATOMIC_KERNEL(op, T) \
__global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \
i += blockDim.x * gridDim.x) { \
paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \
} \
template
<
typename
T
>
__global__
void
AddKernel
(
const
T
*
data_a
,
T
*
data_b
,
size_t
num
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
paddle
::
platform
::
CudaAtomicAdd
(
&
data_b
[
i
],
data_a
[
i
]);
}
}
template
<
typename
T
>
struct
AddFunctor
{
...
...
@@ -39,80 +38,116 @@ struct AddFunctor {
};
template
<
typename
T
>
struct
SubFunctor
{
T
operator
()(
const
T
&
a
,
const
T
&
b
)
{
return
a
-
b
;
}
};
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
#define ARITHMETIC_KERNEL_LAUNCH(op, T) \
void Test##T##op(size_t num) { \
T *in1, *in2, *out; \
T *d_in1, *d_in2; \
size_t size = sizeof(T) * num; \
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = reinterpret_cast<T*>(malloc(size)); \
in2 = reinterpret_cast<T*>(malloc(size)); \
out = reinterpret_cast<T*>(malloc(size)); \
std::minstd_rand engine; \
std::uniform_real_distribution<double> dist(0.0, 1.0); \
for (size_t i = 0; i < num; ++i) { \
in1[i] = static_cast<T>(dist(engine)); \
in2[i] = static_cast<T>(dist(engine)); \
} \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \
cudaDeviceSynchronize(); \
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (size_t i = 0; i < num; ++i) { \
EXPECT_NEAR(static_cast<float>(out[i]), \
static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
0.001); \
} \
free(in1); \
free(in2); \
free(out); \
cudaFree(d_in1); \
cudaFree(d_in2); \
void
TestCase
(
size_t
num
)
{
T
*
in1
,
*
in2
,
*
out
;
T
*
d_in1
,
*
d_in2
;
size_t
size
=
sizeof
(
T
)
*
num
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in1
),
size
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in2
),
size
);
in1
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
in2
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
out
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
std
::
minstd_rand
engine
;
std
::
uniform_real_distribution
<
double
>
dist
(
0.0
,
1.0
);
for
(
size_t
i
=
0
;
i
<
num
;
++
i
)
{
in1
[
i
]
=
static_cast
<
T
>
(
dist
(
engine
));
in2
[
i
]
=
static_cast
<
T
>
(
dist
(
engine
));
}
CUDA_ATOMIC_KERNEL
(
Add
,
float
);
CUDA_ATOMIC_KERNEL
(
Add
,
double
);
CUDA_ATOMIC_KERNEL
(
Add
,
float16
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
float
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
double
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
float16
);
namespace
paddle
{
namespace
platform
{
USE_CUDA_ATOMIC
(
Sub
,
int
);
};
};
CUDA_ATOMIC_KERNEL
(
Sub
,
int
);
ARITHMETIC_KERNEL_LAUNCH
(
Sub
,
int
);
cudaMemcpy
(
d_in1
,
in1
,
size
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_in2
,
in2
,
size
,
cudaMemcpyHostToDevice
);
AddKernel
<
T
><<<
1
,
PADDLE_CUDA_NUM_THREADS
>>>
(
d_in1
,
d_in2
,
num
);
cudaDeviceSynchronize
();
cudaMemcpy
(
out
,
d_in2
,
size
,
cudaMemcpyDeviceToHost
);
cudaDeviceSynchronize
();
for
(
size_t
i
=
0
;
i
<
num
;
++
i
)
{
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR
(
static_cast
<
float
>
(
out
[
i
]),
static_cast
<
float
>
(
AddFunctor
<
T
>
()(
in1
[
i
],
in2
[
i
])),
0.001
);
}
free
(
in1
);
free
(
in2
);
free
(
out
);
cudaFree
(
d_in1
);
cudaFree
(
d_in2
);
}
// cuda primitives
TEST
(
CudaAtomic
,
Add
)
{
TestfloatAdd
(
static_cast
<
size_t
>
(
10
));
TestfloatAdd
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestdoubleAdd
(
static_cast
<
size_t
>
(
10
));
TestdoubleAdd
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
TestCase
<
float
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
float
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
TEST
(
CudaAtomic
,
Sub
)
{
TestintSub
(
static_cast
<
size_t
>
(
10
));
TestintSub
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestCase
<
double
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
double
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
TEST
(
CudaAtomic
,
float16
)
{
using
paddle
::
platform
::
float16
;
Testfloat16Add
(
static_cast
<
size_t
>
(
1
));
Testfloat16Add
(
static_cast
<
size_t
>
(
2
));
Testfloat16Add
(
static_cast
<
size_t
>
(
3
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
1
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
2
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
3
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
// unalignment of uint8
void
TestUnalign
(
size_t
num
,
const
int
shift_bit
)
{
PADDLE_ENFORCE
(
num
%
2
==
0
,
"must be a multiple of 2"
);
float16
*
in1
,
*
in2
,
*
out
;
float16
*
d_in1
,
*
d_in2
;
size_t
size
=
sizeof
(
uint8_t
)
*
(
num
+
shift_bit
);
size_t
array_size
=
sizeof
(
float16
)
*
(
num
/
2
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in1
),
size
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in2
),
size
);
in1
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
in2
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
out
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
// right shift 1, mimic the unalignment of address
float16
*
r_in1
=
reinterpret_cast
<
float16
*>
(
reinterpret_cast
<
uint8_t
*>
(
in1
)
+
shift_bit
);
float16
*
r_in2
=
reinterpret_cast
<
float16
*>
(
reinterpret_cast
<
uint8_t
*>
(
in2
)
+
shift_bit
);
std
::
minstd_rand
engine
;
std
::
uniform_real_distribution
<
double
>
dist
(
0.0
,
1.0
);
for
(
size_t
i
=
0
;
i
<
num
/
2
;
++
i
)
{
r_in1
[
i
]
=
static_cast
<
float16
>
(
dist
(
engine
));
r_in2
[
i
]
=
static_cast
<
float16
>
(
dist
(
engine
));
}
cudaMemcpy
(
d_in1
,
r_in1
,
array_size
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_in2
,
r_in2
,
array_size
,
cudaMemcpyHostToDevice
);
AddKernel
<
float16
><<<
1
,
PADDLE_CUDA_NUM_THREADS
>>>
(
d_in1
,
d_in2
,
num
/
2
);
cudaDeviceSynchronize
();
cudaMemcpy
(
out
,
d_in2
,
array_size
,
cudaMemcpyDeviceToHost
);
cudaDeviceSynchronize
();
for
(
size_t
i
=
0
;
i
<
num
/
2
;
++
i
)
{
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR
(
static_cast
<
float
>
(
out
[
i
]),
static_cast
<
float
>
(
AddFunctor
<
float16
>
()(
r_in1
[
i
],
r_in2
[
i
])),
0.001
);
}
free
(
in1
);
free
(
in2
);
free
(
out
);
cudaFree
(
d_in1
);
cudaFree
(
d_in2
);
}
TEST
(
CudaAtomic
,
float16Unalign
)
{
// same with float16 testcase
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
2
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
2
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
2
);
// shift the address.
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
1
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
1
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
1
);
Testfloat16Add
(
static_cast
<
size_t
>
(
10
));
Testfloat16Add
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
3
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
3
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
3
);
}
paddle/fluid/platform/cuda_primitives.h
浏览文件 @
c13efe02
...
...
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
// convert the value into float and do the add arithmetic.
// then store the result into a uint32.
inline
__device__
uint32_t
add_to_low_half
(
uint32_t
val
,
float
x
)
{
inline
static
__device__
uint32_t
add_to_low_half
(
uint32_t
val
,
float
x
)
{
float16
low_half
;
// the float16 in lower 16bits
low_half
.
x
=
static_cast
<
uint16_t
>
(
val
&
0x
ffff
u
);
low_half
.
x
=
static_cast
<
uint16_t
>
(
val
&
0x
FFFF
u
);
low_half
=
static_cast
<
float16
>
(
static_cast
<
float
>
(
low_half
)
+
x
);
return
(
val
&
0x
ffff
0000u
)
|
low_half
.
x
;
return
(
val
&
0x
FFFF
0000u
)
|
low_half
.
x
;
}
inline
__device__
uint32_t
add_to_high_half
(
uint32_t
val
,
float
x
)
{
inline
static
__device__
uint32_t
add_to_high_half
(
uint32_t
val
,
float
x
)
{
float16
high_half
;
// the float16 in higher 16bits
high_half
.
x
=
static_cast
<
uint16_t
>
(
val
>>
16
);
high_half
=
static_cast
<
float16
>
(
static_cast
<
float
>
(
high_half
)
+
x
);
return
(
val
&
0x
ffff
u
)
|
(
static_cast
<
uint32_t
>
(
high_half
.
x
)
<<
16
);
return
(
val
&
0x
FFFF
u
)
|
(
static_cast
<
uint32_t
>
(
high_half
.
x
)
<<
16
);
}
CUDA_ATOMIC_WRAPPER
(
Add
,
float16
)
{
// concrete packed float16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
size_t
>
(
address
)
&
2
));
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
uintptr_t
>
(
address
)
&
0x0
2
));
float
val_f
=
static_cast
<
float
>
(
val
);
uint32_t
old
=
*
address_as_ui
;
uint32_t
sum
;
uint32_t
newval
;
uint32_t
assumed
;
if
(((
size_t
)
address
&
2
)
==
0
)
{
if
(((
uintptr_t
)
address
&
0x0
2
)
==
0
)
{
// the float16 value stay at lower 16 bits of the address.
do
{
assumed
=
old
;
old
=
atomicCAS
(
address_as_ui
,
assumed
,
add_to_low_half
(
assumed
,
val_f
));
}
while
(
old
!=
assumed
);
float16
ret
;
ret
.
x
=
old
&
0x
ffff
u
;
ret
.
x
=
old
&
0x
FFFF
u
;
return
ret
;
}
else
{
// the float16 value stay at higher 16 bits of the address.
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
c13efe02
...
...
@@ -534,7 +534,7 @@ EOF
make
-j
`
nproc
`
inference_lib_dist
cd
${
PADDLE_ROOT
}
/build
cp
-r
fluid_install_dir fluid
tar
-cf
fluid.tgz fluid
tar
-c
z
f
fluid.tgz fluid
fi
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
c13efe02
...
...
@@ -127,6 +127,7 @@ def __bootstrap__():
]
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'listen_and_serv_profile_period'
)
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
c13efe02
...
...
@@ -4473,15 +4473,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"except one unknown dimension."
)
helper
=
LayerHelper
(
"reshape"
,
**
locals
())
reshaped
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
out
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
helper
.
append_op
(
type
=
"reshape"
,
inputs
=
inputs
,
attrs
=
{
"shape"
:
shape
,
"inplace"
:
inplace
},
outputs
=
{
"Out"
:
reshaped
})
attrs
=
{
"shape"
:
shape
},
outputs
=
{
"Out"
:
out
})
return
helper
.
append_activation
(
reshaped
)
return
helper
.
append_activation
(
out
)
def
lod_reset
(
x
,
y
=
None
,
target_lod
=
None
):
...
...
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
浏览文件 @
c13efe02
...
...
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
print
(
str
(
result_program
))
class
TestMemoryTranspiler2
(
unittest
.
TestCase
):
def
setUp
(
self
):
program
=
Program
()
with
program_guard
(
program
,
startup_program
=
Program
()):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
13
],
dtype
=
'float32'
)
fc
=
layers
.
fc
(
input
=
x
,
size
=
10
,
act
=
None
)
reshape
=
layers
.
reshape
(
x
=
fc
,
shape
=
[
-
1
,
2
,
5
])
fc
=
layers
.
reshape
(
x
=
reshape
,
shape
=
[
-
1
,
5
,
2
])
y_predict
=
layers
.
fc
(
input
=
fc
,
size
=
1
,
act
=
None
)
y
=
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
cost
=
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
layers
.
mean
(
cost
)
opt
=
optimizer
.
SGD
(
learning_rate
=
0.001
)
opt
.
minimize
(
avg_cost
)
self
.
program
=
program
def
test_inplace_ops
(
self
):
print
(
"before optimization"
)
print
(
str
(
self
.
program
))
result_program
=
memory_optimize
(
self
.
program
)
print
(
"after optimization"
)
print
(
str
(
result_program
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_reshape_op.py
浏览文件 @
c13efe02
...
...
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
new_shape
)}
def
test_check_output
(
self
):
...
...
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
self
.
attrs
[
"shape"
])}
def
test_check_output
(
self
):
...
...
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
infered_shape
)}
def
test_check_output
(
self
):
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
c13efe02
...
...
@@ -495,6 +495,7 @@ class DistributeTranspiler(object):
pserver_index
=
self
.
pserver_endpoints
.
index
(
endpoint
)
table_opt_block
=
self
.
_create_table_optimize_block
(
pserver_index
,
pserver_program
,
pre_block_idx
,
grad_to_block_id
)
optimize_blocks
.
append
(
table_opt_block
)
prefetch_var_name_to_block_id
=
self
.
_create_prefetch_block
(
pserver_index
,
pserver_program
,
table_opt_block
)
checkpoint_block_id
=
self
.
_create_checkpoint_save_block
(
...
...
tools/manylinux1/Dockerfile.x64
浏览文件 @
c13efe02
...
...
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
graphviz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录