PaddlePaddle / Paddle-Lite
Commit e1aab593 (unverified), authored Mar 06, 2020 by xiaogang; committed by GitHub, Mar 06, 2020
Develop nlp patch (#3059)
* fix: fix nlp ops input and output type
* fix: add elementwise x_dims > y_dims case
Parent: a9d17eef
Showing 23 changed files with 502 additions and 138 deletions (+502, -138)
lite/api/CMakeLists.txt                                     +11   -0
lite/api/ocr_attention_test.cc                              +10  -10
lite/api/transform_test.cc                                  +69  -68
lite/backends/arm/math/beam_search.cc                        +4   -4
lite/backends/arm/math/elementwise.h                       +151   -1
lite/backends/arm/math/increment.cc                          +1  -12
lite/backends/arm/math/increment.h                           +8   -3
lite/backends/arm/math/scale.h                              +19   -1
lite/backends/arm/math/topk.cc                               +2   -2
lite/backends/arm/math/topk.h                                +1   -1
lite/core/op_registry.cc                                     +1   -0
lite/core/op_registry.h                                      +3   -0
lite/core/tensor.cc                                          +1   -1
lite/kernels/arm/beam_search_decode_compute.cc               +6   -6
lite/kernels/arm/compare_compute.cc                         +47   -0
lite/kernels/arm/compare_compute.h                          +11   -0
lite/kernels/arm/elementwise_compute.cc                     +71  -17
lite/kernels/arm/fill_constant_batch_size_like_compute.cc    +6   -0
lite/kernels/arm/fill_constant_compute.cc                    +6   -0
lite/kernels/arm/increment_compute.cc                       +16   -4
lite/kernels/arm/lookup_table_compute.cc                     +2   -2
lite/kernels/arm/topk_compute.cc                             +1   -1
lite/operators/elementwise_ops.cc                           +55   -5
lite/api/CMakeLists.txt

@@ -230,6 +230,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
     ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
       --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
   add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
+  # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
   # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
   #    DEPS ${lite_model_test_DEPS})

@@ -378,6 +379,16 @@ if(NOT IOS)
     FPGA_DEPS ${fpga_kernels}
     X86_DEPS ${x86_kernels}
     CUDA_DEPS ${cuda_kernels})
+  lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+    ${ops} ${host_kernels}
+    ARM_DEPS ${arm_kernels}
+    CV_DEPS paddle_cv_arm
+    NPU_DEPS ${npu_kernels}
+    XPU_DEPS ${xpu_kernels}
+    CL_DEPS ${opencl_kernels}
+    FPGA_DEPS ${fpga_kernels}
+    X86_DEPS ${x86_kernels}
+    CUDA_DEPS ${cuda_kernels})
 endif()
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
lite/api/ocr_attention_test.cc

@@ -32,18 +32,10 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
   predictor.Build(FLAGS_model_dir, "", "", valid_places);
 
-  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 48, 512})));
-  auto* data = input_tensor->mutable_data<float>();
-  auto item_size = input_tensor->dims().production();
-  for (int i = 0; i < item_size; i++) {
-    data[i] = 1;
-  }
-
   auto* init_scores = predictor.GetInput(2);
   init_scores->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
   auto* data_scores = init_scores->mutable_data<float>();
-  auto scores_size = input_tensor->dims().production();
+  auto scores_size = init_scores->dims().production();
   for (int i = 0; i < scores_size; i++) {
     data_scores[i] = 0;
   }

@@ -53,7 +45,7 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
   auto* init_ids = predictor.GetInput(1);
   init_ids->Resize(DDim(std::vector<DDim::value_type>({1, 1})));
-  auto* data_ids = init_ids->mutable_data<float>();
+  auto* data_ids = init_ids->mutable_data<int64_t>();
   auto ids_size = init_ids->dims().production();
   for (int i = 0; i < ids_size; i++) {
     data_ids[i] = 0;

@@ -62,6 +54,13 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
   std::vector<std::vector<uint64_t>> lod_i{{0, 1}, {0, 1}};
   *lod_ids = lod_i;
 
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 1, 48, 512})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  for (int i = 0; i < item_size; i++) {
+    data[i] = 1;
+  }
   for (int i = 0; i < FLAGS_warmup; ++i) {
     predictor.Run();
   }

@@ -102,6 +101,7 @@ void TestModel(const std::vector<Place>& valid_places, bool use_npu = false) {
 TEST(OcrAttention, test_arm) {
   std::vector<Place> valid_places({
+      Place{TARGET(kARM), PRECISION(kInt64)},
       Place{TARGET(kARM), PRECISION(kFloat)},
   });
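The thread running through this commit is that NLP-style inputs (token ids, beam-search ids, top-k indices) are int64 tensors, so allocating or filling them as float rounds or corrupts the values. A minimal standalone illustration (plain C++, independent of Paddle-Lite) of why routing ids through a float buffer is unsafe:

#include <cstdint>
#include <iostream>

int main() {
  // Token and beam ids are int64 in Paddle's NLP ops. Storing one in a
  // float (24-bit mantissa) silently rounds large ids.
  int64_t id = 50000001;                      // a large vocabulary id
  float as_float = static_cast<float>(id);    // what a float-typed buffer keeps
  int64_t back = static_cast<int64_t>(as_float);
  std::cout << id << " -> " << back << "\n";  // prints: 50000001 -> 50000000
}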
lite/api/transform_test.cc

@@ -28,11 +28,10 @@ DEFINE_int32(batch, 1, "batch");
 namespace paddle {
 namespace lite {
 namespace test_transformer {
 std::vector<std::string> inputed_lines;
-void LoadInputLines(const char* filename) {
+void load_input_lines(const char* filename) {
   static const int max_line_buf_size = 100 * 1024 * 1024;
   char* line_buffer =
       (char*)calloc(max_line_buf_size, sizeof(char));  // NOLINT
   FILE* input_file = fopen(filename, "r");

@@ -49,7 +48,7 @@ void LoadInputLines(const char* filename) {
   line_buffer = NULL;
   fclose(input_file);
 }
-void Split2(const std::string& main_str,
+void split2(const std::string& main_str,
             std::vector<std::string>& str_list,  // NOLINT
             const std::string& delimiter) {
   size_t pre_pos = 0;

@@ -75,19 +74,19 @@ void Split2(const std::string& main_str,
   }
 }
 
-void PadBatchInput(std::vector<std::string>& input_lines,  // NOLINT
-                   int pad_idx,
-                   int n_head,
-                   Tensor* src_word,
-                   Tensor* src_pos,
-                   Tensor* src_attn_bias,
-                   Tensor* trg_word,
-                   Tensor* init_scores,
-                   Tensor* init_idx,
-                   Tensor* trg_bias,
-                   int line_start,
-                   int batch_size,
-                   int bos_idx) {
+void pad_batch_input(std::vector<std::string>& input_lines,  // NOLINT
+                     int pad_idx,
+                     int n_head,
+                     Tensor* src_word,
+                     Tensor* src_pos,
+                     Tensor* src_attn_bias,
+                     Tensor* trg_word,
+                     Tensor* init_scores,
+                     Tensor* init_idx,
+                     Tensor* trg_bias,
+                     int line_start,
+                     int batch_size,
+                     int bos_idx) {
   int max_len = 0;
   int max_line = input_lines.size();

@@ -98,27 +97,27 @@ void PadBatchInput(std::vector<std::string>& input_lines,  // NOLINT
     std::vector<std::string> split_str;
-    test_transformer::Split2(cur_line, split_str, " ");
+    test_transformer::split2(cur_line, split_str, " ");
     batch_lines.push_back(split_str);
     max_len = max_len >= split_str.size() ? max_len : split_str.size();
   }
 
-  src_word->Resize(std::vector<DDim::value_type>({batch_size, max_len, 1}));
-  src_pos->Resize(std::vector<DDim::value_type>({batch_size, max_len, 1}));
+  src_word->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
+  src_pos->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
   src_attn_bias->Resize(
       std::vector<DDim::value_type>({batch_size, n_head, max_len, max_len}));
-  trg_bias->Resize(
-      std::vector<DDim::value_type>({batch_size, n_head, 1, max_len}));
-  float* src_word_data = src_word->mutable_data<float>();
-  float* src_pos_data = src_pos->mutable_data<float>();
+  trg_bias->Resize(
+      std::vector<DDim::value_type>({batch_size, n_head, max_len, max_len}));
+  auto* src_word_data = src_word->mutable_data<int64_t>();
+  auto* src_pos_data = src_pos->mutable_data<int64_t>();
   float* src_bias_data = src_attn_bias->mutable_data<float>();
   float* trg_bias_data = trg_bias->mutable_data<float>();
   for (int i = 0; i < batch_size; ++i) {
     std::vector<std::string> cur_words = batch_lines[i];
     int fill_len = cur_words.size();
     int src_bias_start = i * n_head * max_len * max_len;
-    int trg_bias_start = i * n_head * max_len;
+    int trg_bias_start = i * n_head * max_len * max_len;
     for (int j = 0; j < fill_len; ++j) {
       src_word_data[i * max_len + j] = (atoi(cur_words[j].c_str()));
       src_pos_data[i * max_len + j] = j;

@@ -137,22 +136,24 @@ void PadBatchInput(std::vector<std::string>& input_lines,  // NOLINT
       int value_ind = j % max_len + src_bias_start;
       src_bias_data[j] = src_bias_data[value_ind];
     }
-    for (int j = trg_bias_start; j < trg_bias_start + n_head * max_len; ++j) {
+    for (int j = trg_bias_start;
+         j < trg_bias_start + n_head * max_len * max_len;
+         ++j) {
       int value_ind = j % max_len + trg_bias_start;
       trg_bias_data[j] = trg_bias_data[value_ind];
     }
   }
-  trg_word->Resize(std::vector<DDim::value_type>({batch_size, 1, 1}));
-  auto* trg_word_data = trg_word->mutable_data<float>();
-  for (int i = 0; i < batch_size; ++i) {
+  trg_word->Resize(std::vector<DDim::value_type>({batch_size, max_len}));
+  auto* trg_word_data = trg_word->mutable_data<int64_t>();
+  for (int i = 0; i < batch_size * max_len; ++i) {
     trg_word_data[i] = bos_idx;
   }
 
   init_scores->Resize(std::vector<DDim::value_type>({batch_size, 1}));
   init_idx->Resize(std::vector<DDim::value_type>({batch_size}));
   float* score_data = init_scores->mutable_data<float>();
-  float* idx_data = init_idx->mutable_data<float>();
+  auto* idx_data = init_idx->mutable_data<int32_t>();
   for (int i = 0; i < init_scores->numel(); ++i) {
     score_data[i] = 0;
   }

@@ -175,21 +176,25 @@ void PadBatchInput(std::vector<std::string>& input_lines,  // NOLINT
 void TestModel(const std::vector<Place>& valid_places,
                const Place& preferred_place,
                bool use_npu = false) {
 #ifdef LITE_WITH_ARM
   DeviceInfo::Init();
   DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
 #endif
   lite::Predictor predictor;
   std::string test_data_path = FLAGS_input;
 
-  predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places);
+  predictor.Build("",
+                  FLAGS_model_dir + "/__model__",
+                  FLAGS_model_dir + "/weights",
+                  valid_places);
+  // predictor.Build(FLAGS_model_dir, "", "", valid_places);
 
   int n_head = 8;
   int batch_size = FLAGS_batch;
   int bos_idx = 0;
   int eos_idx = 1;
 
   LOG(INFO) << "reading";
-  test_transformer::LoadInputLines(test_data_path.c_str());
+  test_transformer::load_input_lines(test_data_path.c_str());
   LOG(INFO) << "reading finished";
 
   auto* trg_bias = predictor.GetInput(6);
   auto* src_word = predictor.GetInput(0);

@@ -205,28 +210,31 @@ void TestModel(const std::vector<Place>& valid_places,
   auto start = GetCurrentUS();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    auto start_i = GetCurrentUS();
-    PadBatchInput(test_transformer::inputed_lines,
-                  eos_idx,
-                  n_head,
-                  src_word,    // src_word
-                  src_pos,     // src_pos
-                  src_bias,    // src_bias
-                  trg_word,    // trg_word
-                  init_score,  // init_score
-                  init_idx,    // init_idx
-                  trg_bias,    // trg_bias
-                  i * batch_size,
-                  batch_size,
-                  bos_idx);
-    LOG(INFO) << "src_word:" << src_word->dims();
-    auto start_ii = GetCurrentUS();
-    LOG(INFO) << i << "->ii:" << (start_ii - start_i) / 1000.0;
+    pad_batch_input(test_transformer::inputed_lines,
+                    eos_idx,
+                    n_head,
+                    src_word,    // src_word
+                    src_pos,     // src_pos
+                    src_bias,    // src_bias
+                    trg_word,    // trg_word
+                    init_score,  // init_score
+                    init_idx,    // init_idx
+                    trg_bias,    // trg_bias
+                    i * batch_size,
+                    batch_size,
+                    bos_idx);
     predictor.Run();
-    auto start_iii = GetCurrentUS();
-    LOG(INFO) << i << "->iii:" << (start_iii - start_ii) / 1000.0;
-    auto* outs = predictor.GetOutputs();
-    LOG(INFO) << "out:" << (*outs)[0].dims();
+    auto* outs = predictor.GetOutput(0);
+    auto o_data = outs->data<int64_t>();
+    auto lod = outs->lod();
+    for (int i = 0; i < outs->numel(); ++i) {
+      LOG(INFO) << o_data[i];
+    }
+    for (int i = 0; i < lod.size(); ++i) {
+      for (int j = 0; j < lod[i].size(); ++j) {
+        LOG(INFO) << lod[i][j];
+      }
+    }
   }
 
   LOG(INFO) << "================== Speed Report ===================";

@@ -234,25 +242,18 @@ void TestModel(const std::vector<Place>& valid_places,
             << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
             << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
             << " ms in average.";
-
-  auto* outs = predictor.GetOutputs();
-  for (auto out : *outs) {
-    LOG(INFO) << "======" << "here";
-    LOG(INFO) << out;
-  }
-  LOG(INFO) << "======" << "hereggg";
 }
 
 TEST(OcrAttention, test_arm) {}
 
 }  // namespace lite
 }  // namespace paddle
 
 using namespace paddle::lite;  // NOLINT
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   std::vector<Place> valid_places({
       Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kInt64)},
       Place{TARGET(kARM), PRECISION(kFloat)},
       Place{TARGET(kHost), PRECISION(kFloat)},
   });
   TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)}));
 }
lite/backends/arm/math/beam_search.cc

@@ -70,7 +70,7 @@ void PruneEndBeams(const Tensor *pre_ids,
                    std::vector<std::vector<Item>> *items,
                    size_t lod_level,
                    int end_id) {
-  auto *pre_ids_data = pre_ids->data<float>();
+  auto *pre_ids_data = pre_ids->data<int64_t>();
   auto &high_level = abs_lod[lod_level];
   for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
     size_t src_prefix_start = high_level[src_idx];

@@ -152,10 +152,10 @@ std::vector<std::vector<Item>> SelectTopBeamSizeItems(const Tensor *pre_ids,
   // find the current candidates
   // auto abs_lod = framework::ToAbsOffset(scores->lod());
   auto abs_lod = scores->lod();
-  auto *pre_ids_data = pre_ids->data<float>();
+  auto *pre_ids_data = pre_ids->data<int64_t>();
   auto *pre_scores_data = pre_scores->data<float>();
-  auto *ids_data = ids ? ids->data<int>() : nullptr;
+  auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
   auto *scores_data = scores->data<float>();
   size_t num_seqs = abs_lod[lod_level].size() - 1;

@@ -236,7 +236,7 @@ void beam_search(const Tensor *pre_ids,
   if (parent_idx) {
     parent_idx->Resize(dims);
   }
-  auto *selected_ids_data = selected_ids->mutable_data<float>();
+  auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
   auto *selected_scores_data = selected_scores->mutable_data<float>();
   auto *parent_idx_data = parent_idx ? parent_idx->mutable_data<int>() : nullptr;
lite/backends/arm/math/elementwise.h

@@ -13,11 +13,161 @@
 // limitations under the License.
 
 #pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "lite/operators/op_params.h"
 
 namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {
 
+template <typename T>
+void elementwise_broadcast_common(T const* x_data,
+                                  T const* y_data,
+                                  T* out_data,
+                                  std::vector<int64_t> x_real_dim,
+                                  std::vector<int64_t> y_real_dim,
+                                  std::vector<int64_t> out_real_dim,
+                                  std::string type,
+                                  bool is_xsize_large = false) {
+  int out_size = 1;
+  int max_dim = out_real_dim.size();
+  std::vector<int> index_array(max_dim, 0);
+  for (int i = 0; i < max_dim; ++i) {
+    out_size *= out_real_dim[i];
+  }
+  int x_index, y_index;
+  for (int out_index = 0; out_index < out_size; ++out_index) {
+    x_index = 0;
+    for (int i = 0; i < max_dim; i++) {
+      if (x_real_dim[i] > 1) {
+        x_index = x_index * x_real_dim[i] + index_array[i];
+      }
+    }
+    y_index = 0;
+    for (int i = 0; i < max_dim; i++) {
+      if (y_real_dim[i] > 1) {
+        y_index = y_index * y_real_dim[i] + index_array[i];
+      }
+    }
+    if (type == "add") {
+      out_data[out_index] = x_data[x_index] + y_data[y_index];
+    }
+    if (type == "mul") {
+      out_data[out_index] = x_data[x_index] * y_data[y_index];
+    }
+    for (int i = max_dim - 1; i >= 0; --i) {
+      ++index_array[i];
+      if (index_array[i] >= out_real_dim[i]) {
+        index_array[i] -= out_real_dim[i];
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+template <typename dtype>
+void elementwise_compute_basic(const operators::ElementwiseParam& param,
+                               const std::string elt_type,
+                               const std::string act_type) {
+  const dtype* x_data = param.X->data<const dtype>();
+  const dtype* y_data = param.Y->data<const dtype>();
+  dtype* out_data = param.Out->mutable_data<dtype>();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int batch = 1;
+  int channels = 1;
+  int num = 1;
+  for (int i = 0; i < axis; ++i) {
+    batch *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    channels *= y_dims[i];
+  }
+  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
+    num *= x_dims[i];
+  }
+  // do elementwise add/sub/max...
+  if (elt_type == "add") {
+    for (int i = 0; i < batch; ++i) {
+      for (int j = 0; j < channels; ++j) {
+        int offset = (i * channels + j) * num;
+        const dtype* din_ptr = x_data + offset;
+        const dtype diny_data = y_data[j];
+        dtype* dout_ptr = out_data + offset;
+        for (int k = 0; k < num; ++k) {
+          *dout_ptr = *din_ptr + diny_data;
+          dout_ptr++;
+          din_ptr++;
+        }
+      }
+    }
+  } else if (elt_type == "sub") {
+    for (int i = 0; i < batch; ++i) {
+      for (int j = 0; j < channels; ++j) {
+        int offset = (i * channels + j) * num;
+        const dtype* din_ptr = x_data + offset;
+        const dtype diny_data = y_data[j];
+        dtype* dout_ptr = out_data + offset;
+        for (int k = 0; k < num; ++k) {
+          *dout_ptr = *din_ptr - diny_data;
+          dout_ptr++;
+          din_ptr++;
+        }
+      }
+    }
+  } else if (elt_type == "mul") {
+    for (int i = 0; i < batch; ++i) {
+      for (int j = 0; j < channels; ++j) {
+        int offset = (i * channels + j) * num;
+        const dtype* din_ptr = x_data + offset;
+        const dtype diny_data = y_data[j];
+        dtype* dout_ptr = out_data + offset;
+        for (int k = 0; k < num; ++k) {
+          *dout_ptr = *din_ptr * diny_data;
+          dout_ptr++;
+          din_ptr++;
+        }
+      }
+    }
+  } else if (elt_type == "max") {
+    for (int i = 0; i < batch; ++i) {
+      for (int j = 0; j < channels; ++j) {
+        int offset = (i * channels + j) * num;
+        const dtype* din_ptr = x_data + offset;
+        const dtype diny_data = y_data[j];
+        dtype* dout_ptr = out_data + offset;
+        for (int k = 0; k < num; ++k) {
+          *dout_ptr = std::max(*din_ptr, diny_data);
+          dout_ptr++;
+          din_ptr++;
+        }
+      }
+    }
+  } else {
+    LOG(FATAL) << "unsupported Elementwise type: " << elt_type;
+  }
+  // do activation relu/sigmod...
+  if (act_type.size() > 0) {
+    if (act_type == "relu") {
+      for (int i = 0; i < batch; ++i) {
+        for (int j = 0; j < channels; ++j) {
+          dtype* dout_ptr = out_data + (i * channels + j) * num;
+          for (int k = 0; k < num; ++k) {
+            *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f;
+            dout_ptr++;
+          }
+        }
+      }
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << elt_type;
+    }
+  }
+}
+
 template <typename T>
 void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
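elementwise_broadcast_common above walks the output in row-major order with an odometer (index_array) and collapses every size-1 axis when computing the x/y source offsets. A self-contained sketch of the same indexing scheme on concrete shapes (names and shapes here are illustrative, not part of the library):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // x shape {2,3}, y shape {1,3}: y broadcasts along axis 0.
  std::vector<int64_t> x_dim = {2, 3}, y_dim = {1, 3}, out_dim = {2, 3};
  std::vector<float> x = {0, 1, 2, 10, 11, 12};
  std::vector<float> y = {100, 200, 300};
  int max_dim = out_dim.size();
  int out_size = 1;
  for (int i = 0; i < max_dim; ++i) out_size *= out_dim[i];
  std::vector<float> out(out_size);
  std::vector<int> idx(max_dim, 0);  // the odometer over output coordinates
  for (int o = 0; o < out_size; ++o) {
    int xi = 0, yi = 0;
    for (int i = 0; i < max_dim; ++i) {
      // Size-1 axes contribute nothing: that is where broadcasting happens.
      if (x_dim[i] > 1) xi = xi * x_dim[i] + idx[i];
      if (y_dim[i] > 1) yi = yi * y_dim[i] + idx[i];
    }
    out[o] = x[xi] + y[yi];  // the "add" branch of the common routine
    for (int i = max_dim - 1; i >= 0; --i) {  // odometer increment
      if (++idx[i] >= out_dim[i]) idx[i] = 0; else break;
    }
  }
  for (float v : out) std::cout << v << " ";  // 100 201 302 110 211 312
  std::cout << "\n";
}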
lite/backends/arm/math/increment.cc

@@ -20,18 +20,7 @@
 namespace paddle {
 namespace lite {
 namespace arm {
-namespace math {
-
-void increment(const float* input,
-               const int n,
-               const float step,
-               float* out,
-               Context<TARGET(kARM)>* ctx) {
-  for (int i = 0; i < n; i++) {
-    out[i] = input[i] + step;
-  }
-}
-
-}  // namespace math
+namespace math {}  // namespace math
 }  // namespace arm
 }  // namespace lite
 }  // namespace paddle
lite/backends/arm/math/increment.h

@@ -21,11 +21,16 @@ namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {
-void increment(const float* input,
+template <typename T>
+void increment(const T* input,
                const int n,
                const float step,
-               float* out,
-               Context<TARGET(kARM)>* ctx);
+               T* out,
+               Context<TARGET(kARM)>* ctx) {
+  for (int i = 0; i < n; i++) {
+    out[i] = input[i] + static_cast<T>(step);
+  }
+}
 
 }  // namespace math
 }  // namespace arm
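Moving increment into the header as a template lets the same loop serve float, int32_t, and int64_t kernels; the step stays a float (the op attribute's type) and is cast to the element type. A standalone analogue, with the ARM context argument dropped for brevity:

#include <cstdint>
#include <iostream>

// Simplified analogue of the templated increment above.
template <typename T>
void increment(const T* input, const int n, const float step, T* out) {
  for (int i = 0; i < n; i++) {
    out[i] = input[i] + static_cast<T>(step);
  }
}

int main() {
  int64_t in[] = {0, 1, 2}, out[3];
  increment(in, 3, 1.0f, out);  // instantiated for int64_t
  for (int64_t v : out) std::cout << v << " ";  // 1 2 3
  std::cout << "\n";
}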
lite/backends/arm/math/scale.h

@@ -13,12 +13,30 @@
 // limitations under the License.
 
 #pragma once
+#include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"
 
 namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {
 
+template <typename dtype>
+void scale_compute_basic(const operators::ScaleParam& param) {
+  const dtype* x_data = param.x->data<dtype>();
+  dtype* output_data = param.output->mutable_data<dtype>();
+  DDim x_dims = param.x->dims();
+  DDim output_dims = param.output->dims();
+  bool bias_after_scale = param.bias_after_scale;
+  float scale = param.scale;
+  float bias = param.bias;
+  if (!bias_after_scale) {
+    bias *= scale;
+  }
+  for (int i = 0; i < output_dims.production(); i++) {
+    output_data[i] = static_cast<dtype>(x_data[i] * scale + bias);
+  }
+}
+
 template <typename T>
 void scale(const T* din, T* dout, int num, T scale, T bias);
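scale_compute_basic always evaluates out[i] = x[i] * scale + bias; when bias_after_scale is false the intended semantics are (x + bias) * scale, and pre-multiplying bias by scale folds that into the same single pass. A quick standalone check of the folding:

#include <iostream>

int main() {
  float x = 3.0f, scale = 2.0f, bias = 5.0f;
  bool bias_after_scale = false;  // op semantics: (x + bias) * scale
  float b = bias_after_scale ? bias : bias * scale;  // fold bias into one form
  float out = x * scale + b;
  std::cout << out << " == " << (x + bias) * scale << "\n";  // 16 == 16
}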
lite/backends/arm/math/topk.cc

@@ -26,7 +26,7 @@ bool comp_func(std::pair<float, int> a, std::pair<float, int> b) {
 void topk(const float* in_data,
           float* out_val,
-          int* out_ind,
+          int64_t* out_ind,
           int m,
           int n,
           int k,

@@ -34,7 +34,7 @@ void topk(const float* in_data,
   for (int i = 0; i < m; i++) {
     const float* in_tmp = in_data + i * n;
     float* out_val_tmp = out_val + i * k;
-    int* out_ind_tmp = out_ind + i * k;
+    int64_t* out_ind_tmp = out_ind + i * k;
     std::vector<std::pair<float, int>> vec;
     for (int j = 0; j < n; j++) {
       vec.push_back(std::make_pair(in_tmp[j], j));
lite/backends/arm/math/topk.h

@@ -22,7 +22,7 @@ namespace math {
 void topk(const float* din,
           float* out_val,
-          int* out_ind,
+          int64_t* out_ind,
           int m,
           int n,
           int k,
lite/core/op_registry.cc

@@ -158,6 +158,7 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kARM, kAny, kNCHW);
   INIT_FOR(kARM, kAny, kAny);
   INIT_FOR(kARM, kInt32, kNCHW);
+  INIT_FOR(kARM, kInt64, kNCHW);
 
   INIT_FOR(kOpenCL, kFloat, kNCHW);
   INIT_FOR(kOpenCL, kFloat, kNHWC);
lite/core/op_registry.h

@@ -147,6 +147,9 @@ class KernelRegistry final {
                 KernelRegistryForTarget<TARGET(kARM),
                                         PRECISION(kInt8),
                                         DATALAYOUT(kNCHW)> *,  //
+                KernelRegistryForTarget<TARGET(kARM),
+                                        PRECISION(kInt64),
+                                        DATALAYOUT(kNCHW)> *,  //
                 KernelRegistryForTarget<TARGET(kARM),
                                         PRECISION(kInt32),
                                         DATALAYOUT(kNCHW)> *,  //
lite/core/tensor.cc

@@ -82,7 +82,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
   target_ = other.target_;
   lod_ = other.lod_;
   memory_size_ = other.memory_size_;
-  precision_ = other.precision_;
+  precision_ = other.precision();
   buffer_->CopyDataFrom(*other.buffer_, memory_size_);
 }
lite/kernels/arm/beam_search_decode_compute.cc

@@ -38,7 +38,7 @@ const size_t kSentenceLevel = 1;
 template <typename T>
 struct Sentence {
-  std::vector<float> word_ids;
+  std::vector<int64_t> word_ids;
   std::vector<T> scores;
 };

@@ -73,7 +73,7 @@ struct BeamSearchDecoder {
     std::vector<uint64_t> source_level_lod = {0};
     std::vector<uint64_t> sentence_level_lod = {0};
-    std::vector<float> id_data;
+    std::vector<int64_t> id_data;
     std::vector<T> score_data;
 
     for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {

@@ -117,9 +117,9 @@ struct BeamSearchDecoder {
     *(id_tensor->mutable_lod()) = lod;
     id_tensor->Resize({static_cast<int64_t>(id_data.size())});
-    auto id_ptr = id_tensor->mutable_data<float>();
+    auto id_ptr = id_tensor->mutable_data<int64_t>();
     TargetCopy(
-        TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));
+        TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t));
 
     *(score_tensor->mutable_lod()) = lod;
     score_tensor->Resize({static_cast<int64_t>(score_data.size())});

@@ -169,7 +169,7 @@ struct BeamSearchDecoder {
          ++candidate_idx) {
       prefix_idx_vector.push_back(prefix_idx);
       size_t idx = prefix_idx_vector.size() - 1;
-      auto cur_id = cur_ids.data<float>()[candidate_idx];
+      auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
       auto cur_score = cur_scores.data<T>()[candidate_idx];
       sentence_vector.at(idx).word_ids.push_back(cur_id);
       sentence_vector.at(idx).scores.push_back(cur_score);

@@ -184,7 +184,7 @@ struct BeamSearchDecoder {
           cur_ids.lod().at(kSentenceLevel)[prefix_idx];
       for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
         auto candidate_idx = prefix_idx_vector.at(idx);
-        auto cur_id = cur_ids.data<float>()[candidate_idx];
+        auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
         auto cur_score = cur_scores.data<T>()[candidate_idx];
         if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
           // to skip redundant end tokens
lite/kernels/arm/compare_compute.cc

@@ -148,6 +148,42 @@ void CompareCompute_int32<Functor>::Run() {
   }
 }
 
+template <template <typename T> class Functor>
+void CompareCompute_int64<Functor>::Run() {
+  auto& param = this->Param<operators::CompareParam>();
+  using CompareFunctor = Functor<int64_t>;
+  const size_t x_size = param.X->numel();
+  const size_t y_size = param.Y->numel();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  bool* z = param.Out->template mutable_data<bool>();
+  const auto* x = param.X->template data<int64_t>();
+  const auto* y = param.Y->template data<int64_t>();
+  auto axis = param.axis;
+  bool force_cpu = param.force_cpu;
+  if (x_size == y_size) {
+    for (int i = 0; i < x_size; ++i) {
+      z[i] = CompareFunctor()(x[i], y[i]);
+    }
+  } else {
+    int axis = (param.axis == -1 ? x_dims.size() - y_dims.size() : param.axis);
+    int outer_num, mid_num, inner_num;
+    get_mid_dims(x_dims, y_dims, axis, &outer_num, &mid_num, &inner_num);
+    for (int outer_id = 0; outer_id < outer_num; ++outer_id) {
+      for (int mid_id = 0; mid_id < mid_num; ++mid_id) {
+        auto y_data = y[mid_id];
+        for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
+          int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
+          z[index] = CompareFunctor()(x[index], y_data);
+          // z[index] = x[index] < y_data;
+        }
+      }
+    }
+  }
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

@@ -164,6 +200,17 @@ REGISTER_LITE_KERNEL(less_than,
     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
     .Finalize();
+REGISTER_LITE_KERNEL(
+    less_than,
+    kARM,
+    kInt64,
+    kNCHW,
+    paddle::lite::kernels::arm::CompareCompute_int64<
+        paddle::lite::kernels::arm::_LessThanFunctor>,
+    def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
+    .Finalize();
 REGISTER_LITE_KERNEL(equal,
                      kARM,
                      kFloat,
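The broadcast branch above flattens X's shape around Y: the axes before axis form outer_num, Y's own extent forms mid_num, and the trailing axes form inner_num, so each Y element is compared against an inner_num-sized slab of X. A self-contained sketch of the decomposition that the Lite helper get_mid_dims performs (inlined here for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // X shape {2, 3, 4}, Y shape {3}, axis = 1:
  // outer_num = 2 (before axis), mid_num = 3 (Y), inner_num = 4 (after).
  std::vector<int64_t> x_dims = {2, 3, 4}, y_dims = {3};
  int axis = 1;
  int outer_num = 1, mid_num = 1, inner_num = 1;
  for (int i = 0; i < axis; ++i) outer_num *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) mid_num *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    inner_num *= x_dims[i];

  std::vector<int64_t> x(24), y = {5, 50, 500};
  for (int i = 0; i < 24; ++i) x[i] = i * 10;
  std::vector<bool> z(24);
  for (int o = 0; o < outer_num; ++o)
    for (int m = 0; m < mid_num; ++m)
      for (int i = 0; i < inner_num; ++i) {
        int index = (o * mid_num + m) * inner_num + i;
        z[index] = x[index] < y[m];  // less_than, as _LessThanFunctor does
      }
  // x[0]=0<5, x[4]=40<50, x[23]=230<500: all true.
  std::cout << z[0] << z[4] << z[23] << "\n";  // 111
}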
lite/kernels/arm/compare_compute.h

@@ -46,6 +46,17 @@ class CompareCompute_int32
   ~CompareCompute_int32() {}
 };
 
+template <template <typename T> class Functor>
+class CompareCompute_int64
+    : public KernelLite<TARGET(kARM), PRECISION(kInt64)> {
+ public:
+  using param_t = operators::LogicalParam;
+
+  void Run() override;
+
+  ~CompareCompute_int64() {}
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
lite/kernels/arm/elementwise_compute.cc

@@ -80,7 +80,11 @@ void ElementwiseAddCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_add_broadcast(
+        y_data, x_data, out_data, pre, n, post);
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     lite::arm::math::elementwise_add_broadcast(
         x_data, y_data, out_data, pre, n, post);
   } else {

@@ -99,7 +103,15 @@ void ElementwiseAddActivationCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_add_relu_broadcast(
+          y_data, x_data, out_data, pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     if (act_type == "relu") {
       lite::arm::math::elementwise_add_relu_broadcast(
           x_data, y_data, out_data, pre, n, post);

@@ -125,6 +137,9 @@ void ElementwiseSubCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
+  if (x_dims.size() < y_dims.size()) {
+    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
+  }
   if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     lite::arm::math::elementwise_sub_broadcast(
         x_data, y_data, out_data, pre, n, post);

@@ -143,6 +158,9 @@ void ElementwiseSubActivationCompute::Run() {
   std::string act_type = param.act_type;
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
+  if (x_dims.size() < y_dims.size()) {
+    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
+  }
   int pre, n, post;
   if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     if (act_type == "relu") {

@@ -164,19 +182,29 @@ void ElementwiseSubActivationCompute::Run() {
 template <typename T, PrecisionType PType>
 void ElementwiseMulCompute<T, PType>::Run() {
   auto& param = this->template Param<operators::ElementwiseParam>();
-  auto* x_data = param.X->template data<T>();
-  auto* y_data = param.Y->template data<T>();
-  auto* out_data = param.Out->template mutable_data<T>();
-  int axis = param.axis;
-  auto x_dims = param.X->dims();
-  auto y_dims = param.Y->dims();
-  int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
-    lite::arm::math::elementwise_mul_broadcast<T>(
-        x_data, y_data, out_data, pre, n, post);
+  if (param.X->precision() == PRECISION(kFloat)) {
+    auto* x_data = param.X->template data<float>();
+    auto* y_data = param.Y->template data<float>();
+    auto* out_data = param.Out->template mutable_data<float>();
+    int axis = param.axis;
+    auto x_dims = param.X->dims();
+    auto y_dims = param.Y->dims();
+    int pre, n, post;
+    if (x_dims.size() < y_dims.size() &&
+        is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+      lite::arm::math::elementwise_mul_broadcast<float>(
+          y_data, x_data, out_data, pre, n, post);
+    } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+      lite::arm::math::elementwise_mul_broadcast<float>(
+          x_data, y_data, out_data, pre, n, post);
+    } else {
+      lite::arm::math::elementwise_mul<float>(
+          x_data, y_data, out_data, x_dims.production());
+    }
+  } else if (param.X->precision() == PRECISION(kInt64)) {
+    lite::arm::math::elementwise_compute_basic<int64_t>(param, "mul", "");
   } else {
-    lite::arm::math::elementwise_mul<T>(
-        x_data, y_data, out_data, x_dims.production());
+    LOG(FATAL) << "unsupport input type";
   }
 }

@@ -190,7 +218,15 @@ void ElementwiseMulActivationCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_mul_relu_broadcast<float>(
+          y_data, x_data, out_data, pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     if (act_type == "relu") {
       lite::arm::math::elementwise_mul_relu_broadcast(
           x_data, y_data, out_data, pre, n, post);

@@ -216,7 +252,11 @@ void ElementwiseMaxCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_max_broadcast(
+        y_data, x_data, out_data, pre, n, post);
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     lite::arm::math::elementwise_max_broadcast(
         x_data, y_data, out_data, pre, n, post);
   } else {

@@ -235,7 +275,15 @@ void ElementwiseMaxActivationCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
-  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+  if (x_dims.size() < y_dims.size() &&
+      is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_max_relu_broadcast<float>(
+          y_data, x_data, out_data, pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     if (act_type == "relu") {
       lite::arm::math::elementwise_max_relu_broadcast(
           x_data, y_data, out_data, pre, n, post);

@@ -261,6 +309,9 @@ void ElementwiseDivCompute::Run() {
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
   int pre, n, post;
+  if (x_dims.size() < y_dims.size()) {
+    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
+  }
   if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     lite::arm::math::elementwise_div_broadcast(
         x_data, y_data, out_data, pre, n, post);

@@ -279,6 +330,9 @@ void ElementwiseDivActivationCompute::Run() {
   std::string act_type = param.act_type;
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
+  if (x_dims.size() < y_dims.size()) {
+    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
+  }
   int pre, n, post;
   if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
     if (act_type == "relu") {
lite/kernels/arm/fill_constant_batch_size_like_compute.cc

@@ -39,6 +39,12 @@ void FillConstantBatchSizeLikeCompute::Run() {
     for (int i = 0; i < param.out->numel(); i++) {
       data[i] = param.value;
     }
+  } else if (param.dtype ==
+             static_cast<int32_t>(lite::core::FluidType::INT64)) {
+    auto data = param.out->template mutable_data<int64_t>();
+    for (int i = 0; i < param.out->numel(); i++) {
+      data[i] = param.value;
+    }
   } else {
     LOG(FATAL) << "not supported dtype " << param.dtype;
   }
lite/kernels/arm/fill_constant_compute.cc

@@ -39,6 +39,12 @@ void FillConstantCompute::Run() {
     for (int i = 0; i < param.out->numel(); i++) {
      data[i] = param.value;
     }
+  } else if (param.dtype ==
+             static_cast<int32_t>(lite::core::FluidType::INT64)) {
+    auto data = param.out->template mutable_data<int64_t>();
+    for (int i = 0; i < param.out->numel(); i++) {
+      data[i] = param.value;
+    }
   } else {
     LOG(FATAL) << "not supported dtype " << param.dtype;
   }
lite/kernels/arm/increment_compute.cc

@@ -27,10 +27,22 @@ void IncrementCompute::Run() {
   auto& param = this->Param<operators::IncrementParam>();
 
   int total_num = param.X->dims().production();
-  const auto* x_data = param.X->data<float>();
-  auto* o_data = param.Out->mutable_data<float>();
-  lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
+  if (param.X->precision() == PRECISION(kFloat)) {
+    const auto* x_data = param.X->data<float>();
+    auto* o_data = param.Out->mutable_data<float>();
+    lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
+  } else if (param.X->precision() == PRECISION(kInt64)) {
+    const auto* x_data = param.X->data<int64_t>();
+    auto* o_data = param.Out->mutable_data<int64_t>();
+    lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
+  } else if (param.X->precision() == PRECISION(kInt32)) {
+    const auto* x_data = param.X->data<int32_t>();
+    auto* o_data = param.Out->mutable_data<int32_t>();
+    lite::arm::math::increment(x_data, total_num, param.step, o_data, &ctx);
+  } else {
+    LOG(FATAL) << "unsupport input type " << PrecisionToStr(param.X->precision());
+  }
 }
 
 }  // namespace arm
lite/kernels/arm/lookup_table_compute.cc

@@ -72,7 +72,7 @@ REGISTER_LITE_KERNEL(lookup_table,
                      paddle::lite::kernels::arm::LookupTableCompute,
                      def)
     .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();

@@ -83,6 +83,6 @@ REGISTER_LITE_KERNEL(lookup_table_v2,
                      paddle::lite::kernels::arm::LookupTableCompute,
                      def)
     .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
lite/kernels/arm/topk_compute.cc

@@ -25,7 +25,7 @@ void TopkCompute::Run() {
   auto& param = Param<operators::TopkParam>();
   const float* x_data = param.X->data<float>();
   float* out_val = param.Out->mutable_data<float>();
-  int* out_ind = param.Indices->mutable_data<int>();
+  auto out_ind = param.Indices->mutable_data<int64_t>();
   DDim x_dims = param.X->dims();
   int K = param.K;
   int dim_size = x_dims.size();
lite/operators/elementwise_ops.cc

@@ -13,8 +13,9 @@
 // limitations under the License.
 
 #include "lite/operators/elementwise_ops.h"
+#include <algorithm>
 #include <cmath>
 #include "lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace operators {

@@ -27,10 +28,59 @@ bool ElementwiseOp::CheckShape() const {
 }
 
 bool ElementwiseOp::InferShape() const {
   CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size());
-  param_.Out->Resize(param_.X->dims());
-  auto out_lod = param_.Out->mutable_lod();
-  *out_lod = param_.X->lod();
+  auto x_dim = param_.X->dims();
+  auto y_dim = param_.Y->dims();
+  if (x_dim == y_dim) {
+    param_.Out->Resize(x_dim);
+    auto out_lod = param_.Out->mutable_lod();
+    *out_lod = param_.X->lod();
+  } else {
+    int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size());
+    int axis = param_.axis;
+    axis = (axis == -1
+                ? std::abs(static_cast<int>(x_dim.size() - y_dim.size()))
+                : axis);
+    std::vector<int64_t> x_dims_array(max_dim);
+    std::vector<int64_t> y_dims_array(max_dim);
+    std::vector<int64_t> out_dims_array(max_dim);
+    if (x_dim.size() > y_dim.size()) {
+      for (int i = 0; i < axis; ++i) {
+        y_dims_array[i] = 1;
+      }
+      if (axis + y_dim.size() < max_dim) {
+        for (int i = axis + y_dim.size(); i < max_dim; ++i) {
+          y_dims_array[i] = 1;
+        }
+      }
+      x_dims_array = x_dim.Vectorize();
+      for (int i = 0; i < y_dim.size(); ++i) {
+        y_dims_array[i + axis] = y_dim[i];
+      }
+    } else {
+      for (int i = 0; i < axis; ++i) {
+        x_dims_array[i] = 1;
+      }
+      if (axis + x_dim.size() < max_dim) {
+        for (int i = axis + x_dim.size(); i < max_dim; ++i) {
+          x_dims_array[i] = 1;
+        }
+      }
+      y_dims_array = y_dim.Vectorize();
+      for (int i = 0; i < x_dim.size(); ++i) {
+        x_dims_array[i + axis] = x_dim[i];
+      }
+    }
+    for (int i = 0; i < max_dim; i++) {
+      if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
+        out_dims_array[i] = -1;
+      } else {
+        out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
+      }
+    }
+    param_.Out->Resize(DDim(out_dims_array));
+    auto out_lod = param_.Out->mutable_lod();
+    *out_lod = param_.X->lod();
+  }
   return true;
 }
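A worked instance of the new InferShape broadcast path: with X = {2, 3, 4, 5}, Y = {3, 4}, and axis = 1, Y is padded to {1, 3, 4, 1} and each output extent is the per-axis max, giving {2, 3, 4, 5}. A standalone sketch of that computation (plain vectors stand in for DDim):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dim = {2, 3, 4, 5}, y_dim = {3, 4};
  int axis = 1;  // position of y_dim[0] inside x_dim; -1 resolves to 2 here
  size_t max_dim = std::max(x_dim.size(), y_dim.size());

  // Pad the shorter shape with 1s outside [axis, axis + y_dim.size()).
  std::vector<int64_t> x_arr = x_dim, y_arr(max_dim, 1);
  for (size_t i = 0; i < y_dim.size(); ++i) y_arr[i + axis] = y_dim[i];

  // Output extent per axis is the max; -1 (unknown) propagates.
  std::vector<int64_t> out(max_dim);
  for (size_t i = 0; i < max_dim; ++i) {
    out[i] = (x_arr[i] == -1 || y_arr[i] == -1)
                 ? -1
                 : std::max(x_arr[i], y_arr[i]);
  }
  for (int64_t d : out) std::cout << d << " ";  // 2 3 4 5
  std::cout << "\n";
}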