Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
db3ad39f
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
db3ad39f
编写于
4月 12, 2018
作者:
吴
吴承辉
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'share_buffer' into 'master'
Share tmp buffer among ops See merge request !379
上级
a0a7849e
76521e98
变更
7
显示空白变更内容
内联
并排
Showing
7 changed files
with
216 additions
and
84 deletions
+216
-84
mace/core/buffer.h
mace/core/buffer.h
+73
-19
mace/core/workspace.cc
mace/core/workspace.cc
+11
-0
mace/core/workspace.h
mace/core/workspace.h
+5
-1
mace/kernels/arm/conv_2d.cc
mace/kernels/arm/conv_2d.cc
+114
-53
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+9
-9
mace/ops/conv_2d.h
mace/ops/conv_2d.h
+2
-1
mace/ops/fused_conv_2d.h
mace/ops/fused_conv_2d.h
+2
-1
未找到文件。
mace/core/buffer.h
浏览文件 @
db3ad39f
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#define MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include <vector>
#include <vector>
#include <algorithm>
#include <functional>
#include <functional>
#include "mace/core/allocator.h"
#include "mace/core/allocator.h"
...
@@ -161,12 +162,10 @@ class Buffer : public BufferBase {
...
@@ -161,12 +162,10 @@ class Buffer : public BufferBase {
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
void
Clear
()
{
void
Clear
()
{
if
(
buf_
!=
nullptr
)
{
memset
(
reinterpret_cast
<
char
*>
(
raw_mutable_data
()),
0
,
size_
);
memset
(
buf_
,
0
,
size_
);
}
}
}
pr
ivate
:
pr
otected
:
Allocator
*
allocator_
;
Allocator
*
allocator_
;
void
*
buf_
;
void
*
buf_
;
void
*
mapped_buf_
;
void
*
mapped_buf_
;
...
@@ -267,19 +266,23 @@ class Image : public BufferBase {
...
@@ -267,19 +266,23 @@ class Image : public BufferBase {
class
BufferSlice
:
public
BufferBase
{
class
BufferSlice
:
public
BufferBase
{
public:
public:
BufferSlice
()
BufferSlice
()
:
buffer_
(
nullptr
),
mapped_buf_
(
nullptr
),
offset_
(
0
),
length
_
(
0
)
{}
:
BufferBase
(
0
),
buffer_
(
nullptr
),
mapped_buf_
(
nullptr
),
offset
_
(
0
)
{}
BufferSlice
(
BufferBase
*
buffer
,
index_t
offset
,
index_t
length
)
BufferSlice
(
BufferBase
*
buffer
,
index_t
offset
,
index_t
length
)
:
BufferBase
(
buffer
->
size
()
),
:
BufferBase
(
length
),
buffer_
(
buffer
),
buffer_
(
buffer
),
mapped_buf_
(
nullptr
),
mapped_buf_
(
nullptr
),
offset_
(
offset
),
offset_
(
offset
)
{
length_
(
length
)
{
MACE_CHECK
(
offset
>=
0
,
"buffer slice offset should >= 0"
);
MACE_CHECK
(
offset
>=
0
,
"buffer slice offset should >= 0"
);
MACE_CHECK
(
offset
+
length
<=
size_
,
"buffer slice offset + length ("
,
MACE_CHECK
(
offset
+
length
<=
buffer
->
size
(),
offset
,
" + "
,
length
,
") should <= "
,
size_
);
"buffer slice offset + length ("
,
offset
,
" + "
,
length
,
") should <= "
,
buffer
->
size
());
}
}
BufferSlice
(
const
BufferSlice
&
other
)
BufferSlice
(
const
BufferSlice
&
other
)
:
BufferSlice
(
other
.
buffer_
,
other
.
offset_
,
other
.
length
_
)
{}
:
BufferSlice
(
other
.
buffer_
,
other
.
offset_
,
other
.
size
_
)
{}
~
BufferSlice
()
{
~
BufferSlice
()
{
if
(
buffer_
!=
nullptr
&&
mapped_buf_
!=
nullptr
)
{
if
(
buffer_
!=
nullptr
&&
mapped_buf_
!=
nullptr
)
{
...
@@ -303,8 +306,13 @@ class BufferSlice : public BufferBase {
...
@@ -303,8 +306,13 @@ class BufferSlice : public BufferBase {
}
}
void
*
raw_mutable_data
()
{
void
*
raw_mutable_data
()
{
MACE_NOT_IMPLEMENTED
;
if
(
OnHost
())
{
return
nullptr
;
MACE_CHECK_NOTNULL
(
buffer_
);
return
reinterpret_cast
<
char
*>
(
buffer_
->
raw_mutable_data
())
+
offset_
;
}
else
{
MACE_CHECK_NOTNULL
(
mapped_buf_
);
return
mapped_buf_
;
}
}
}
void
*
Map
(
index_t
offset
,
index_t
length
,
std
::
vector
<
size_t
>
*
pitch
)
const
{
void
*
Map
(
index_t
offset
,
index_t
length
,
std
::
vector
<
size_t
>
*
pitch
)
const
{
...
@@ -317,7 +325,7 @@ class BufferSlice : public BufferBase {
...
@@ -317,7 +325,7 @@ class BufferSlice : public BufferBase {
void
Map
(
std
::
vector
<
size_t
>
*
pitch
)
{
void
Map
(
std
::
vector
<
size_t
>
*
pitch
)
{
MACE_CHECK_NOTNULL
(
buffer_
);
MACE_CHECK_NOTNULL
(
buffer_
);
MACE_CHECK
(
mapped_buf_
==
nullptr
,
"mapped buf is not null"
);
MACE_CHECK
(
mapped_buf_
==
nullptr
,
"mapped buf is not null"
);
mapped_buf_
=
buffer_
->
Map
(
offset_
,
length
_
,
pitch
);
mapped_buf_
=
buffer_
->
Map
(
offset_
,
size
_
,
pitch
);
}
}
void
UnMap
()
{
void
UnMap
()
{
...
@@ -326,7 +334,10 @@ class BufferSlice : public BufferBase {
...
@@ -326,7 +334,10 @@ class BufferSlice : public BufferBase {
mapped_buf_
=
nullptr
;
mapped_buf_
=
nullptr
;
}
}
void
Resize
(
index_t
size
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Resize
(
index_t
size
)
{
MACE_CHECK
(
size
==
size_
,
"resize buffer slice from "
,
size_
,
" to "
,
size
,
" is illegal"
);
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
...
@@ -335,15 +346,58 @@ class BufferSlice : public BufferBase {
...
@@ -335,15 +346,58 @@ class BufferSlice : public BufferBase {
bool
OnHost
()
const
{
return
buffer_
->
OnHost
();
}
bool
OnHost
()
const
{
return
buffer_
->
OnHost
();
}
void
Clear
()
{
void
Clear
()
{
MACE_NOT_IMPLEMENTED
;
memset
(
raw_mutable_data
(),
0
,
size_
)
;
}
}
private:
private:
BufferBase
*
buffer_
;
BufferBase
*
buffer_
;
void
*
mapped_buf_
;
void
*
mapped_buf_
;
index_t
offset_
;
index_t
offset_
;
index_t
length_
;
};
};
// A Buffer that hands out consecutive BufferSlice views of its own storage.
//
// The buffer keeps a running byte offset (offset_). Each call to Scratch()
// returns a slice starting at the current offset and then advances the offset
// by the requested size, so slices obtained between two Rewind() calls do not
// overlap. Rewind() only resets the offset; it does not release or clear any
// storage.
//
// NOTE(review): GrowSize() forwards to Buffer::Resize(). Whether Resize keeps
// the underlying storage address stable is not visible from this class, so
// presumably GrowSize() should be called before any slice is requested --
// confirm against Buffer::Resize.
class ScratchBuffer : public Buffer {
 public:
  // Creates a scratch buffer on `allocator` using Buffer's allocator-only
  // constructor; the scratch offset starts at 0.
  explicit ScratchBuffer(Allocator *allocator)
      : Buffer(allocator), offset_(0) {}

  // Creates a scratch buffer of `size` via Buffer(allocator, size).
  ScratchBuffer(Allocator *allocator, index_t size)
      : Buffer(allocator, size), offset_(0) {}

  // Creates a scratch buffer via Buffer(allocator, data, size).
  ScratchBuffer(Allocator *allocator, void *data, index_t size)
      : Buffer(allocator, data, size), offset_(0) {}

  virtual ~ScratchBuffer() {}

  // Ensures the buffer size is at least `size`. Never shrinks: Resize() is
  // invoked only when `size` exceeds the current size_.
  void GrowSize(index_t size) {
    if (size > size_) {
      Resize(size);
    }
  }

  // Reserves the next `size` units of the buffer and returns a slice over
  // them. Fails a MACE_CHECK when `size` is negative or when the remaining
  // space (size_ - offset_) is smaller than `size`. A zero-sized request is
  // allowed and yields an empty slice at the current offset.
  BufferSlice Scratch(index_t size) {
    // A negative size would satisfy the capacity check below and move
    // offset_ backwards, letting later slices overlap earlier ones.
    MACE_CHECK(size >= 0, "scratch size should >= 0, but got ", size);
    MACE_CHECK(offset_ + size <= size_,
               "scratch size not enough: ",
               offset_,
               " + ",
               size,
               " > ",
               size_);
    BufferSlice slice(this, offset_, size);
    offset_ += size;
    return slice;
  }

  // Resets the scratch offset to 0 so the storage can be handed out again.
  // Slices returned before the rewind will alias slices returned after it.
  void Rewind() {
    offset_ = 0;
  }

 private:
  // Offset of the next slice to hand out, measured from the buffer start.
  index_t offset_;
};
}
// namespace mace
}
// namespace mace
#endif // MACE_CORE_BUFFER_H_
#endif // MACE_CORE_BUFFER_H_
mace/core/workspace.cc
浏览文件 @
db3ad39f
...
@@ -12,6 +12,9 @@
...
@@ -12,6 +12,9 @@
namespace
mace
{
namespace
mace
{
Workspace
::
Workspace
()
:
host_scratch_buffer_
(
new
ScratchBuffer
(
GetDeviceAllocator
(
DeviceType
::
CPU
)))
{}
Tensor
*
Workspace
::
CreateTensor
(
const
std
::
string
&
name
,
Tensor
*
Workspace
::
CreateTensor
(
const
std
::
string
&
name
,
Allocator
*
alloc
,
Allocator
*
alloc
,
DataType
type
)
{
DataType
type
)
{
...
@@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
...
@@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
}
}
}
// Returns the scratch buffer to use for `device_type`.
//
// CPU and NEON both map to the single host scratch buffer held by this
// workspace; every other device type gets nullptr. The returned pointer is
// non-owning: the buffer stays owned by the workspace's unique_ptr.
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
  const bool runs_on_host = (device_type == CPU) || (device_type == NEON);
  return runs_on_host ? host_scratch_buffer_.get() : nullptr;
}
}
// namespace mace
}
// namespace mace
mace/core/workspace.h
浏览文件 @
db3ad39f
...
@@ -20,7 +20,7 @@ class Workspace {
...
@@ -20,7 +20,7 @@ class Workspace {
public:
public:
typedef
std
::
map
<
std
::
string
,
std
::
unique_ptr
<
Tensor
>>
TensorMap
;
typedef
std
::
map
<
std
::
string
,
std
::
unique_ptr
<
Tensor
>>
TensorMap
;
Workspace
()
{}
Workspace
()
;
~
Workspace
()
{}
~
Workspace
()
{}
Tensor
*
CreateTensor
(
const
std
::
string
&
name
,
Tensor
*
CreateTensor
(
const
std
::
string
&
name
,
...
@@ -39,6 +39,8 @@ class Workspace {
...
@@ -39,6 +39,8 @@ class Workspace {
void
LoadModelTensor
(
const
NetDef
&
net_def
,
DeviceType
type
);
void
LoadModelTensor
(
const
NetDef
&
net_def
,
DeviceType
type
);
ScratchBuffer
*
GetScratchBuffer
(
DeviceType
device_type
);
private:
private:
void
CreateImageOutputTensor
(
const
NetDef
&
net_def
);
void
CreateImageOutputTensor
(
const
NetDef
&
net_def
);
...
@@ -48,6 +50,8 @@ class Workspace {
...
@@ -48,6 +50,8 @@ class Workspace {
PreallocatedPooledAllocator
preallocated_allocator_
;
PreallocatedPooledAllocator
preallocated_allocator_
;
std
::
unique_ptr
<
ScratchBuffer
>
host_scratch_buffer_
;
DISABLE_COPY_AND_ASSIGN
(
Workspace
);
DISABLE_COPY_AND_ASSIGN
(
Workspace
);
};
};
...
...
mace/kernels/arm/conv_2d.cc
浏览文件 @
db3ad39f
...
@@ -154,17 +154,28 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -154,17 +154,28 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
int
pad_left
=
paddings
[
1
]
>>
1
;
int
pad_left
=
paddings
[
1
]
>>
1
;
int
pad_right
=
paddings
[
1
]
-
pad_left
;
int
pad_right
=
paddings
[
1
]
-
pad_left
;
std
::
function
<
void
(
const
float
*
input
,
float
*
output
)
>
conv_func
;
auto
input_data
=
input
->
data
<
float
>
();
auto
input_data
=
input
->
data
<
float
>
();
auto
filter_data
=
filter
->
data
<
float
>
();
auto
filter_data
=
filter
->
data
<
float
>
();
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
float
>
();
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
float
>
();
auto
output_data
=
output
->
mutable_data
<
float
>
();
auto
output_data
=
output
->
mutable_data
<
float
>
();
if
(
USE_WINOGRAD
&&
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
std
::
function
<
void
(
const
float
*
input
,
float
*
output
)
>
conv_func
;
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
bool
use_winograd
=
USE_WINOGRAD
&&
filter_h
==
3
&&
filter_w
==
3
&&
input_channels
>=
8
&&
channels
>=
8
)
{
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
&&
input_channels
>=
8
&&
channels
>=
8
;
bool
use_neon_3x3_s1
=
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_3x3_s2
=
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
2
&&
stride_w
==
2
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_1x1_s1
=
filter_h
==
1
&&
filter_w
==
1
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
;
std
::
vector
<
index_t
>
transformed_input_shape
;
std
::
vector
<
index_t
>
transformed_output_shape
;
std
::
vector
<
index_t
>
transformed_filter_shape
;
if
(
use_winograd
)
{
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
WINOGRAD_OUT_TILE_SIZE
);
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
WINOGRAD_OUT_TILE_SIZE
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
WINOGRAD_OUT_TILE_SIZE
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
WINOGRAD_OUT_TILE_SIZE
);
...
@@ -181,12 +192,90 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -181,12 +192,90 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
index_t
tile_count
=
tile_height_count
*
tile_width_count
;
index_t
tile_count
=
tile_height_count
*
tile_width_count
;
index_t
in_tile_area
=
index_t
in_tile_area
=
(
WINOGRAD_OUT_TILE_SIZE
+
2
)
*
(
WINOGRAD_OUT_TILE_SIZE
+
2
);
(
WINOGRAD_OUT_TILE_SIZE
+
2
)
*
(
WINOGRAD_OUT_TILE_SIZE
+
2
);
transformed_input_
.
Resize
({
in_tile_area
,
batch
,
input_channels
,
transformed_input_shape
.
insert
(
transformed_input_shape
.
end
(),
{
in_tile_area
,
batch
,
input_channels
,
tile_count
});
transformed_output_shape
.
insert
(
transformed_output_shape
.
end
(),
{
in_tile_area
,
batch
,
channels
,
tile_count
});
tile_count
});
transformed_filter_
.
Resize
({
in_tile_area
,
channels
,
input_channels
});
transformed_filter_shape
.
insert
(
transformed_filter_shape
.
end
(),
transformed_output_
.
Resize
({
in_tile_area
,
batch
,
channels
,
tile_count
});
{
in_tile_area
,
channels
,
input_channels
});
}
else
if
(
use_neon_3x3_s1
)
{
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
2
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_input_width
=
std
::
max
(
padded_input_width
,
extra_output_width
+
2
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
}
if
(
extra_input_width
!=
padded_input_width
)
{
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
}
else
if
(
use_neon_3x3_s2
)
{
extra_output_height
=
height
;
extra_input_height
=
std
::
max
(
padded_input_height
,
(
extra_output_height
-
1
)
*
2
+
3
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_input_width
=
std
::
max
(
padded_input_width
,
(
extra_output_width
-
1
)
*
2
+
3
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
}
if
(
extra_input_width
!=
padded_input_width
)
{
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
}
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
// decide scratch size before allocate it
index_t
total_scratch_size
=
0
;
index_t
transformed_input_size
=
0
;
index_t
transformed_output_size
=
0
;
index_t
padded_input_size
=
0
;
index_t
padded_output_size
=
0
;
if
(
use_winograd
)
{
transformed_input_size
=
std
::
accumulate
(
transformed_input_shape
.
begin
(),
transformed_input_shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
())
*
sizeof
(
float
);
transformed_output_size
=
std
::
accumulate
(
transformed_output_shape
.
begin
(),
transformed_output_shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
())
*
sizeof
(
float
);
total_scratch_size
+=
transformed_input_size
+
transformed_output_size
;
}
if
(
extra_input_height
!=
input_height
||
extra_input_width
!=
input_width
)
{
padded_input_size
=
batch
*
input_channels
*
(
input_height
+
pad_top
+
pad_bottom
)
*
(
input_width
+
pad_left
+
pad_right
)
*
sizeof
(
float
);
total_scratch_size
+=
padded_input_size
;
}
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
padded_output_size
=
batch
*
channels
*
extra_output_height
*
extra_output_width
*
sizeof
(
float
);
total_scratch_size
+=
padded_output_size
;
}
// Init scratch buffer
scratch_
->
Rewind
();
scratch_
->
GrowSize
(
total_scratch_size
);
Tensor
transformed_input
(
scratch_
->
Scratch
(
transformed_input_size
),
DT_FLOAT
);
Tensor
transformed_output
(
scratch_
->
Scratch
(
transformed_output_size
),
DT_FLOAT
);
Tensor
padded_input
(
scratch_
->
Scratch
(
padded_input_size
),
DT_FLOAT
);
Tensor
padded_output
(
scratch_
->
Scratch
(
padded_output_size
),
DT_FLOAT
);
// decide which convolution function to call
if
(
use_winograd
)
{
transformed_input
.
Resize
(
transformed_input_shape
);
transformed_output
.
Resize
(
transformed_output_shape
);
if
(
!
is_filter_transformed_
)
{
transformed_filter_
.
Resize
(
transformed_filter_shape
);
}
conv_func
=
[
&
](
const
float
*
pad_input
,
float
*
pad_output
)
{
WinoGradConv3x3s1
(
pad_input
,
WinoGradConv3x3s1
(
pad_input
,
filter_data
,
filter_data
,
batch
,
batch
,
...
@@ -195,26 +284,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -195,26 +284,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
input_channels
,
input_channels
,
channels
,
channels
,
WINOGRAD_OUT_TILE_SIZE
,
WINOGRAD_OUT_TILE_SIZE
,
transformed_input
_
.
mutable_data
<
float
>
(),
transformed_input
.
mutable_data
<
float
>
(),
transformed_filter_
.
mutable_data
<
float
>
(),
transformed_filter_
.
mutable_data
<
float
>
(),
transformed_output
_
.
mutable_data
<
float
>
(),
transformed_output
.
mutable_data
<
float
>
(),
is_filter_transformed_
,
is_filter_transformed_
,
pad_output
);
pad_output
);
is_filter_transformed_
=
true
;
is_filter_transformed_
=
true
;
};
};
}
else
if
(
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
&&
stride_w
==
1
}
else
if
(
use_neon_3x3_s1
)
{
&&
dilation_h
==
1
&&
dilation_w
==
1
)
{
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
2
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_input_width
=
std
::
max
(
padded_input_width
,
extra_output_width
+
2
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
}
if
(
extra_input_width
!=
padded_input_width
)
{
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK3x3S1
(
pad_input
,
Conv2dNeonK3x3S1
(
pad_input
,
filter_data
,
filter_data
,
...
@@ -227,21 +304,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -227,21 +304,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
channels
,
channels
,
pad_output
);
pad_output
);
};
};
}
else
if
(
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
2
&&
stride_w
==
2
}
else
if
(
use_neon_3x3_s2
)
{
&&
dilation_h
==
1
&&
dilation_w
==
1
)
{
extra_output_height
=
height
;
extra_input_height
=
std
::
max
(
padded_input_height
,
(
extra_output_height
-
1
)
*
2
+
3
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_input_width
=
std
::
max
(
padded_input_width
,
(
extra_output_width
-
1
)
*
2
+
3
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
}
if
(
extra_input_width
!=
padded_input_width
)
{
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK3x3S2
(
pad_input
,
Conv2dNeonK3x3S2
(
pad_input
,
filter_data
,
filter_data
,
...
@@ -254,8 +317,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -254,8 +317,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
channels
,
channels
,
pad_output
);
pad_output
);
};
};
}
else
if
(
filter_h
==
1
&&
filter_w
==
1
&&
stride_h
==
1
&&
stride_w
==
1
}
else
if
(
use_neon_1x1_s1
)
{
&&
dilation_h
==
1
&&
dilation_w
==
1
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK1x1S1
(
input_data
,
Conv2dNeonK1x1S1
(
input_data
,
filter_data
,
filter_data
,
...
@@ -287,28 +349,27 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
...
@@ -287,28 +349,27 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
};
};
}
}
// pad input and output
const
Tensor
*
pad_input_ptr
=
input
;
const
Tensor
*
pad_input_ptr
=
input
;
// Keep this alive during kernel execution
if
(
extra_input_height
!=
input_height
||
extra_input_width
!=
input_width
)
{
if
(
extra_input_height
!=
input_height
||
extra_input_width
!=
input_width
)
{
padded_input
.
Clear
();
ConstructNCHWInputWithSpecificPadding
(
input
,
ConstructNCHWInputWithSpecificPadding
(
input
,
pad_top
,
pad_top
,
pad_bottom
,
pad_bottom
,
pad_left
,
pad_left
,
pad_right
,
pad_right
,
&
padded_input
_
);
&
padded_input
);
pad_input_ptr
=
&
padded_input
_
;
pad_input_ptr
=
&
padded_input
;
}
}
const
float
*
pad_input_data
=
pad_input_ptr
->
data
<
float
>
();
Tensor
*
pad_output_ptr
=
output
;
Tensor
*
pad_output_ptr
=
output
;
// Keep this alive during kernel execution
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
if
(
extra_output_height
!=
height
||
extra_output_width
!=
width
)
{
std
::
vector
<
index_t
>
extra_output_shape
padded_output
.
Resize
({
batch
,
channels
,
extra_output_height
,
{
batch
,
channels
,
extra_output_height
,
extra_output_width
};
extra_output_width
});
padded_output_
.
Resize
(
extra_output_shape
);
padded_output
.
Clear
();
padded_output_
.
Clear
();
pad_output_ptr
=
&
padded_output
;
pad_output_ptr
=
&
padded_output_
;
}
}
const
float
*
pad_input_data
=
pad_input_ptr
->
data
<
float
>
();
float
*
pad_output_data
=
pad_output_ptr
->
mutable_data
<
float
>
();
float
*
pad_output_data
=
pad_output_ptr
->
mutable_data
<
float
>
();
conv_func
(
pad_input_data
,
pad_output_data
);
conv_func
(
pad_input_data
,
pad_output_data
);
...
...
mace/kernels/conv_2d.h
浏览文件 @
db3ad39f
...
@@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
...
@@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
paddings
,
const
int
*
dilations
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
:
Conv2dFunctorBase
(
strides
,
padding_type
,
padding_type
,
paddings
,
paddings
,
...
@@ -422,14 +423,16 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
...
@@ -422,14 +423,16 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
paddings
,
const
int
*
dilations
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
:
Conv2dFunctorBase
(
strides
,
padding_type
,
padding_type
,
paddings
,
paddings
,
dilations
,
dilations
,
activation
,
activation
,
relux_max_limit
),
relux_max_limit
),
is_filter_transformed_
(
false
)
{}
is_filter_transformed_
(
false
),
scratch_
(
scratch
)
{}
void
operator
()(
const
Tensor
*
input
,
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
filter
,
...
@@ -437,13 +440,9 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
...
@@ -437,13 +440,9 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
// TODO(liyin): share tmp buffers among ops
Tensor
padded_input_
;
Tensor
padded_output_
;
Tensor
transformed_input_
;
Tensor
transformed_filter_
;
Tensor
transformed_filter_
;
Tensor
transformed_output_
;
bool
is_filter_transformed_
;
bool
is_filter_transformed_
;
ScratchBuffer
*
scratch_
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
@@ -453,7 +452,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
...
@@ -453,7 +452,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
paddings
,
const
int
*
dilations
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
const
float
relux_max_limit
,
ScratchBuffer
*
scratch
)
:
Conv2dFunctorBase
(
strides
,
:
Conv2dFunctorBase
(
strides
,
padding_type
,
padding_type
,
paddings
,
paddings
,
...
...
mace/ops/conv_2d.h
浏览文件 @
db3ad39f
...
@@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
...
@@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this
->
paddings_
,
this
->
paddings_
,
this
->
dilations_
.
data
(),
this
->
dilations_
.
data
(),
kernels
::
ActivationType
::
NOOP
,
kernels
::
ActivationType
::
NOOP
,
0.0
f
)
{}
0.0
f
,
ws
->
GetScratchBuffer
(
D
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
mace/ops/fused_conv_2d.h
浏览文件 @
db3ad39f
...
@@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
...
@@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
kernels
::
StringToActivationType
(
kernels
::
StringToActivationType
(
OperatorBase
::
GetSingleArgument
<
std
::
string
>
(
"activation"
,
OperatorBase
::
GetSingleArgument
<
std
::
string
>
(
"activation"
,
"NOOP"
)),
"NOOP"
)),
OperatorBase
::
GetSingleArgument
<
float
>
(
"max_limit"
,
0.0
f
))
{}
OperatorBase
::
GetSingleArgument
<
float
>
(
"max_limit"
,
0.0
f
),
ws
->
GetScratchBuffer
(
D
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录