Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5238a7f5
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5238a7f5
编写于
10月 08, 2018
作者:
Q
qiaolongfei
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into optimize-opyreader
test=develop
上级
b2fd2158
34b81f60
变更
22
隐藏空白更改
内联
并排
Showing
22 changed file
with
242 addition
and
52 deletion
+242
-52
cmake/cblas.cmake
cmake/cblas.cmake
+1
-1
cmake/external/openblas.cmake
cmake/external/openblas.cmake
+4
-4
cmake/flags.cmake
cmake/flags.cmake
+2
-0
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+18
-3
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+17
-0
paddle/fluid/framework/naive_executor.h
paddle/fluid/framework/naive_executor.h
+4
-0
paddle/fluid/framework/rw_lock.h
paddle/fluid/framework/rw_lock.h
+1
-0
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+0
-2
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+0
-1
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+6
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+8
-0
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+96
-0
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+3
-2
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+0
-2
paddle/fluid/inference/tests/book/CMakeLists.txt
paddle/fluid/inference/tests/book/CMakeLists.txt
+0
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+64
-27
paddle/fluid/operators/while_op.cc
paddle/fluid/operators/while_op.cc
+6
-4
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+5
-2
paddle/fluid/train/CMakeLists.txt
paddle/fluid/train/CMakeLists.txt
+0
-1
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+4
-0
python/paddle/fluid/tests/CMakeLists.txt
python/paddle/fluid/tests/CMakeLists.txt
+1
-0
python/paddle/fluid/transpiler/inference_transpiler.py
python/paddle/fluid/transpiler/inference_transpiler.py
+2
-2
未找到文件。
cmake/cblas.cmake
浏览文件 @
5238a7f5
...
...
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
/usr/local/opt/openblas/lib
)
find_path
(
OPENBLAS_INC_DIR NAMES cblas.h
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
)
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
NO_DEFAULT_PATH
)
find_path
(
OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
)
find_library
(
OPENBLAS_LIB NAMES openblas
...
...
cmake/external/openblas.cmake
浏览文件 @
5238a7f5
...
...
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})
SET
(
CBLAS_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/openblas
)
SET
(
CBLAS_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/openblas
)
SET
(
CBLAS_INC
LUDE
_DIR
"
${
CBLAS_INSTALL_DIR
}
/include"
CACHE PATH
"openblas include directory."
FORCE
)
SET
(
CBLAS_INC_DIR
"
${
CBLAS_INSTALL_DIR
}
/include"
CACHE PATH
"openblas include directory."
FORCE
)
SET
(
CBLAS_LIBRARIES
"
${
CBLAS_INSTALL_DIR
}
/lib/
${
CMAKE_STATIC_LIBRARY_PREFIX
}
openblas
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
"
...
...
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
ENDIF
(
NOT WIN32
)
SET
(
CBLAS_PROVIDER openblas
)
IF
(
WITH_C_API
)
INSTALL
(
DIRECTORY
${
CBLAS_INC
LUDE
_DIR
}
DESTINATION third_party/openblas
)
INSTALL
(
DIRECTORY
${
CBLAS_INC_DIR
}
DESTINATION third_party/openblas
)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF
(
ANDROID
)
...
...
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
ENDIF
(
NOT
${
CBLAS_FOUND
}
)
MESSAGE
(
STATUS
"BLAS library:
${
CBLAS_LIBRARIES
}
"
)
MESSAGE
(
STATUS
"BLAS Include:
${
CBLAS_INC
LUDE
_DIR
}
"
)
INCLUDE_DIRECTORIES
(
${
CBLAS_INC
LUDE
_DIR
}
)
MESSAGE
(
STATUS
"BLAS Include:
${
CBLAS_INC_DIR
}
"
)
INCLUDE_DIRECTORIES
(
${
CBLAS_INC_DIR
}
)
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
...
...
cmake/flags.cmake
浏览文件 @
5238a7f5
...
...
@@ -157,6 +157,8 @@ if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set
(
CMAKE_OSX_ARCHITECTURES
"x86_64"
CACHE STRING
"Build architectures for OSX"
FORCE
)
endif
()
# On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
set
(
COMMON_FLAGS -Wno-deprecated-register
)
endif
(
APPLE
)
if
(
LINUX
)
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
5238a7f5
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
function
(
windows_symbolic TARGET
)
...
...
@@ -9,11 +10,23 @@ function(windows_symbolic TARGET)
if
(
NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc OR NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cu
)
message
(
FATAL
"
${
src
}
.cc and
${
src
}
.cu must exsits, and
${
src
}
.cu must be symbolic file."
)
endif
()
add_custom_command
(
OUTPUT .
${
src
}
.cu
# only copy the xx.cu to .xx.cu when the content are modified
set
(
copy_flag 1
)
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu
)
file
(
READ
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc SOURCE_STR
)
file
(
READ
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu TARGET_STR
)
if
(
SOURCE_STR STREQUAL TARGET_STR
)
set
(
copy_flag 0
)
endif
()
endif
()
if
(
copy_flag
)
add_custom_command
(
OUTPUT .
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E remove
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E copy
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc"
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu"
COMMENT
"create hidden file of
${
src
}
.cu"
)
add_custom_target
(
${
TARGET
}
ALL DEPENDS .
${
src
}
.cu
)
endif
(
copy_flag
)
add_custom_target
(
${
TARGET
}
ALL DEPENDS .
${
src
}
.cu
)
endforeach
()
endfunction
()
...
...
@@ -81,6 +94,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if
(
WITH_GPU
)
if
(
WIN32
)
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
windows_symbolic
(
hidden_file SRCS data_type_transform.cu
)
nv_library
(
data_type_transform SRCS .data_type_transform.cu DEPS tensor
)
add_dependencies
(
data_type_transform hidden_file
)
...
...
@@ -149,7 +164,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor
op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
elementwise_add_op
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op
)
endif
()
if
(
NOT WIN32
)
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
5238a7f5
...
...
@@ -146,5 +146,22 @@ void NaiveExecutor::CleanFeedFetchOps() {
ops_
.
swap
(
ops
);
}
void
NaiveExecutor
::
EnableMKLDNN
(
const
ProgramDesc
&
program
)
{
#ifdef PADDLE_WITH_MKLDNN
VLOG
(
3
)
<<
"use_mkldnn=True"
;
for
(
size_t
block_id
=
0
;
block_id
<
program
.
Size
();
++
block_id
)
{
auto
*
block
=
const_cast
<
ProgramDesc
&>
(
program
).
MutableBlock
(
block_id
);
for
(
auto
*
op
:
block
->
AllOps
())
{
if
(
op
->
HasAttr
(
"use_mkldnn"
))
{
op
->
SetAttr
(
"use_mkldnn"
,
true
);
}
}
}
#else
LOG
(
WARNING
)
<<
"'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"
;
#endif
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/naive_executor.h
浏览文件 @
5238a7f5
...
...
@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -46,6 +48,8 @@ class NaiveExecutor {
void
CleanFeedFetchOps
();
void
EnableMKLDNN
(
const
ProgramDesc
&
program
);
protected:
void
CreateVariables
(
const
ProgramDesc
&
desc
,
Scope
*
scope
,
int
block_id
);
...
...
paddle/fluid/framework/rw_lock.h
浏览文件 @
5238a7f5
...
...
@@ -46,6 +46,7 @@ struct RWLock {
private:
pthread_rwlock_t
lock_
;
};
// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
#else
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
5238a7f5
...
...
@@ -20,8 +20,6 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
function
(
inference_analysis_test TARGET
)
if
(
WITH_TESTING
)
set
(
options
""
)
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
5238a7f5
...
...
@@ -31,7 +31,6 @@ function(inference_api_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
5238a7f5
...
...
@@ -71,6 +71,11 @@ bool AnalysisPredictor::Init(
}
else
{
inference_program_
=
program
;
}
if
(
config_
.
_use_mkldnn
)
{
executor_
->
EnableMKLDNN
(
*
inference_program_
);
}
executor_
->
Prepare
(
scope_
.
get
(),
*
inference_program_
,
0
,
config_
.
use_feed_fetch_ops
);
...
...
@@ -92,6 +97,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to set feed"
;
return
false
;
}
// Run the inference program
// if share variables, we need not create variables
executor_
->
Run
();
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
5238a7f5
...
...
@@ -70,6 +70,14 @@ if (NOT EXISTS ${OCR_INSTALL_DIR})
endif
()
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
# resnet50
set
(
RESNET50_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
)
if
(
NOT EXISTS
${
RESNET50_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
RESNET50_INSTALL_DIR
}
${
INFERENCE_URL
}
"resnet50_model.tar.gz"
)
endif
()
inference_analysis_test
(
test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
RESNET50_INSTALL_DIR
}
/model
)
# anakin
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
# anakin rnn1
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
0 → 100644
浏览文件 @
5238a7f5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
param_file
=
FLAGS_infer_model
+
"/params"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/model"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
PaddleTensor
input
;
// channel=3, height/width=318
std
::
vector
<
int
>
shape
({
FLAGS_batch_size
,
3
,
318
,
318
});
input
.
shape
=
shape
;
input
.
dtype
=
PaddleDType
::
FLOAT32
;
// fill input data, for profile easily, do not use random data here.
size_t
size
=
FLAGS_batch_size
*
3
*
318
*
318
;
input
.
data
.
Resize
(
size
*
sizeof
(
float
));
float
*
input_data
=
static_cast
<
float
*>
(
input
.
data
.
data
());
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
{
*
(
input_data
+
i
)
=
static_cast
<
float
>
(
i
)
/
size
;
}
std
::
vector
<
PaddleTensor
>
input_slots
;
input_slots
.
assign
({
input
});
(
*
inputs
).
emplace_back
(
input_slots
);
}
// Easy for profiling independently.
TEST
(
Analyzer_resnet50
,
profile
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
1UL
);
size_t
size
=
GetSize
(
outputs
[
0
]);
// output is a 512-dimension feature
EXPECT_EQ
(
size
,
512
*
FLAGS_batch_size
);
}
}
// Check the fuse status
TEST
(
Analyzer_resnet50
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
}
// Compare result of NativeConfig and AnalysisConfig
TEST
(
Analyzer_resnet50
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
5238a7f5
...
...
@@ -270,10 +270,11 @@ TEST(Analyzer_rnn1, multi_thread) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
4
/* multi_thread */
);
}
bool
CompareTensors
(
framework
::
Scope
&
a_scope
,
framework
::
Scope
&
b_scope
,
bool
CompareTensors
(
const
framework
::
Scope
&
a_scope
,
const
framework
::
Scope
&
b_scope
,
const
std
::
vector
<
std
::
string
>
&
tensors
)
{
for
(
auto
&
x
:
tensors
)
{
auto
*
a_var
=
a_scope
.
FindVar
(
x
);
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
5238a7f5
...
...
@@ -61,8 +61,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
ir_passes
.
push_back
(
"fc_gru_fuse_pass"
);
#ifdef PADDLE_WITH_MKLDNN
cfg
->
_use_mkldnn
=
true
;
// disable mkldnn fuse since it should have some bugs
cfg
->
ir_passes
.
push_back
(
"conv_relu_mkldnn_fuse_pass"
);
#endif
}
...
...
paddle/fluid/inference/tests/book/CMakeLists.txt
浏览文件 @
5238a7f5
...
...
@@ -4,7 +4,6 @@ function(inference_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
set
(
arg_list
""
)
if
(
inference_test_ARGS
)
foreach
(
arg
${
inference_test_ARGS
}
)
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
5238a7f5
...
...
@@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
* 3. go to the second setp, until one thread's topk value is null;
* 4. go to the first setp, until get the topk value.
*/
template
<
typename
T
,
int
MaxLength
,
int
BlockSize
>
__global__
void
KeMatrixTopK
(
T
*
output
,
int
output_stride
,
int64_t
*
indices
,
const
T
*
src
,
int
lds
,
int
dim
,
int
k
)
{
const
T
*
src
,
int
lds
,
int
dim
,
int
k
,
int
grid_dim
,
int
num
)
{
__shared__
Pair
<
T
>
sh_topk
[
BlockSize
];
__shared__
int
maxid
[
BlockSize
/
2
];
const
int
tid
=
threadIdx
.
x
;
const
int
warp
=
threadIdx
.
x
/
32
;
output
+=
blockIdx
.
x
*
output_stride
;
indices
+=
blockIdx
.
x
*
k
;
Pair
<
T
>
topk
[
MaxLength
];
int
beam
=
MaxLength
;
Pair
<
T
>
max
;
bool
is_empty
=
false
;
bool
firststep
=
true
;
const
int
bid
=
blockIdx
.
x
;
for
(
int
i
=
bid
;
i
<
num
;
i
+=
grid_dim
)
{
output
+=
i
*
output_stride
;
indices
+=
i
*
k
;
Pair
<
T
>
topk
[
MaxLength
];
int
beam
=
MaxLength
;
Pair
<
T
>
max
;
bool
is_empty
=
false
;
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
src
+
i
*
lds
,
&
firststep
,
&
is_empty
,
&
max
,
dim
,
tid
);
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
sh_topk
[
tid
]
=
topk
[
0
];
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
output
,
&
indices
,
&
beam
,
&
k
,
tid
,
warp
);
}
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
src
+
blockIdx
.
x
*
lds
,
&
firststep
,
&
is_empty
,
&
max
,
dim
,
tid
);
sh_topk
[
tid
]
=
topk
[
0
];
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
output
,
&
indices
,
&
beam
,
&
k
,
tid
,
warp
);
}
inline
static
int
GetDesiredBlockDim
(
int
dim
)
{
if
(
dim
>
128
)
{
return
256
;
}
else
if
(
dim
>
64
)
{
return
128
;
}
else
if
(
dim
>
32
)
{
return
64
;
}
else
{
return
32
;
}
}
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
template
<
typename
T
>
class
TopkOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -310,18 +339,26 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
// NOTE: pass lds and dim same to input width.
// NOTE: old matrix implementation of stride is different to eigen.
// TODO(typhoonzero): refine this kernel.
dim3
threads
(
256
,
1
);
dim3
grid
(
input_height
,
1
);
KeMatrixTopK
<
T
,
5
,
256
><<<
grid
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
()
>>>
(
output_data
,
output
->
dims
()[
1
],
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
));
const
int
kMaxHeight
=
2048
;
int
gridx
=
input_height
<
kMaxHeight
?
input_height
:
kMaxHeight
;
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
switch
(
GetDesiredBlockDim
(
input_width
))
{
FIXED_BLOCK_DIM
(
KeMatrixTopK
<
T
,
5
,
kBlockDim
><<<
gridx
,
kBlockDim
,
0
,
dev_ctx
.
stream
()
>>>
(
output_data
,
output
->
dims
()[
1
],
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
),
gridx
,
input_height
));
default:
PADDLE_THROW
(
"Error"
);
}
}
};
#undef FIXED_BLOCK_DIM_BASE
#undef FIXED_BLOCK_DIM
}
// namespace operators
}
// namespace paddle
...
...
paddle/fluid/operators/while_op.cc
浏览文件 @
5238a7f5
...
...
@@ -224,10 +224,12 @@ class WhileGradOp : public framework::OperatorBase {
if
(
cur_scope_iter
==
step_scopes
->
rbegin
())
{
auto
*
var
=
(
*
cur_scope_iter
)
->
FindVar
(
inside_grad_name
);
PADDLE_ENFORCE_NOT_NULL
(
var
,
"Can not find var %s"
,
inside_grad_name
);
PADDLE_ENFORCE
(
var
->
IsType
<
framework
::
LoDTensorArray
>
()
||
var
->
IsType
<
LoDTensor
>
(),
"Currently the type of var only can be LoDTensorArray "
"or LoDTensor."
);
PADDLE_ENFORCE
(
var
->
IsType
<
framework
::
LoDTensorArray
>
()
||
var
->
IsType
<
LoDTensor
>
(),
"Currently the type of var only can be LoDTensorArray, "
"or LoDTensor, but the received var[%s] is %s."
,
inside_grad_name
,
var
->
Type
().
name
());
if
(
var
->
IsType
<
LoDTensor
>
())
{
auto
&
inside_tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
5238a7f5
...
...
@@ -20,8 +20,11 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
DEFINE_double
(
fraction_of_gpu_memory_to_use
,
0.92
,
"Default use 92% of GPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"
);
"Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated "
"from the trunk. If the trunk doesn't have enough gpu memory, "
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
namespace
paddle
{
namespace
platform
{
...
...
paddle/fluid/train/CMakeLists.txt
浏览文件 @
5238a7f5
...
...
@@ -4,7 +4,6 @@ function(train_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
train_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
set
(
arg_list
""
)
if
(
train_test_ARGS
)
foreach
(
arg
${
train_test_ARGS
}
)
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
5238a7f5
...
...
@@ -1570,6 +1570,10 @@ class DynamicRNN(object):
The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
get the output sequence.
NOTES:
Currently it is not supported that setting is_sparse to True of any
layers within DynamicRNN.
"""
BEFORE_RNN
=
0
IN_RNN
=
1
...
...
python/paddle/fluid/tests/CMakeLists.txt
浏览文件 @
5238a7f5
set
(
PYTHON_TESTS_DIR
${
CMAKE_CURRENT_BINARY_DIR
}
CACHE PATH
"python tests directory"
)
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
...
...
python/paddle/fluid/transpiler/inference_transpiler.py
浏览文件 @
5238a7f5
...
...
@@ -124,7 +124,7 @@ class InferenceTranspiler(object):
next_op
=
self
.
block
.
ops
[
i
+
1
]
if
next_op
.
type
==
'relu'
:
# modify bnorm OP to include relu
current_op
.
set_attr
(
"fuse_relu"
,
True
)
current_op
.
_
set_attr
(
"fuse_relu"
,
True
)
# remove relu OP
self
.
block
.
_remove_op
(
i
+
1
)
i
=
i
+
1
...
...
@@ -454,7 +454,7 @@ class InferenceTranspiler(object):
:type eltwise_op: Operator
'''
conv_op
.
set_attr
(
"fuse_eltwise"
,
True
)
conv_op
.
_
set_attr
(
"fuse_eltwise"
,
True
)
self
.
input_map
[
conv_op
.
output
(
"Output"
)[
0
]]
=
eltwise_op
.
input
(
"Y"
)[
0
]
self
.
input_map
[
eltwise_op
.
output
(
"Out"
)[
0
]]
=
eltwise_op
.
input
(
"Y"
)[
0
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录