Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
25d2d959
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
25d2d959
编写于
6月 19, 2018
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix opencl default lws calculation bug to support low-end SOCs.
上级
649f1819
变更
16
隐藏空白更改
内联
并排
Showing
16 changed files
with
67 additions
and
38 deletions
+67
-38
mace/core/runtime/opencl/opencl_allocator.cc
mace/core/runtime/opencl/opencl_allocator.cc
+1
-1
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+16
-4
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+9
-2
mace/kernels/opencl/batch_norm.cc
mace/kernels/opencl/batch_norm.cc
+1
-4
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+2
-2
mace/kernels/opencl/conv_2d.cc
mace/kernels/opencl/conv_2d.cc
+3
-3
mace/kernels/opencl/conv_2d_1x1.cc
mace/kernels/opencl/conv_2d_1x1.cc
+4
-2
mace/kernels/opencl/conv_2d_3x3.cc
mace/kernels/opencl/conv_2d_3x3.cc
+4
-2
mace/kernels/opencl/conv_2d_general.cc
mace/kernels/opencl/conv_2d_general.cc
+4
-2
mace/kernels/opencl/deconv_2d_opencl.cc
mace/kernels/opencl/deconv_2d_opencl.cc
+1
-1
mace/kernels/opencl/depthwise_conv.cc
mace/kernels/opencl/depthwise_conv.cc
+8
-6
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+2
-1
mace/kernels/opencl/pooling.cc
mace/kernels/opencl/pooling.cc
+3
-2
mace/kernels/opencl/resize_bilinear.cc
mace/kernels/opencl/resize_bilinear.cc
+3
-2
mace/kernels/opencl/softmax.cc
mace/kernels/opencl/softmax.cc
+4
-2
mace/ops/buffer_to_image_test.cc
mace/ops/buffer_to_image_test.cc
+2
-2
未找到文件。
mace/core/runtime/opencl/opencl_allocator.cc
浏览文件 @
25d2d959
...
...
@@ -56,7 +56,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
nbytes
,
nullptr
,
&
error
);
if
(
error
!=
CL_SUCCESS
)
{
LOG
(
WARNING
)
<<
"Allocate OpenCL Buffer with "
<<
nbytes
<<
" bytes failed because of"
<<
nbytes
<<
" bytes failed because of "
<<
OpenCLErrorToString
(
error
);
delete
buffer
;
*
result
=
nullptr
;
...
...
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
25d2d959
...
...
@@ -371,7 +371,8 @@ OpenCLRuntime::OpenCLRuntime():
}
cl_int
err
;
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
)
{
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
OpenCLVersion
::
CL_VER_2_0
)
{
std
::
vector
<
cl_context_properties
>
context_properties
;
context_properties
.
reserve
(
5
);
GetAdrenoContextProperties
(
&
context_properties
,
...
...
@@ -698,7 +699,7 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
bool
OpenCLRuntime
::
IsNonUniformWorkgroupsSupported
()
const
{
return
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
"2.0"
);
opencl_version_
==
OpenCLVersion
::
CL_VER_2_0
);
}
GPUType
OpenCLRuntime
::
gpu_type
()
const
{
...
...
@@ -709,13 +710,24 @@ const std::string OpenCLRuntime::platform_info() const {
return
platform_info_
;
}
const
std
::
string
OpenCLRuntime
::
ParseDeviceVersion
(
OpenCLVersion
OpenCLRuntime
::
ParseDeviceVersion
(
const
std
::
string
&
device_version
)
{
// OpenCL Device version string format:
// OpenCL<space><major_version.minor_version><space>
// <vendor-specific information>
auto
words
=
Split
(
device_version
,
' '
);
return
words
[
1
];
if
(
words
[
1
]
==
"2.0"
)
{
return
OpenCLVersion
::
CL_VER_2_0
;
}
else
if
(
words
[
1
]
==
"1.2"
)
{
return
OpenCLVersion
::
CL_VER_1_2
;
}
else
if
(
words
[
1
]
==
"1.1"
)
{
return
OpenCLVersion
::
CL_VER_1_1
;
}
else
if
(
words
[
1
]
==
"1.0"
)
{
return
OpenCLVersion
::
CL_VER_1_0
;
}
else
{
LOG
(
FATAL
)
<<
"Do not support OpenCL version: "
<<
words
[
1
];
return
OpenCLVersion
::
CL_VER_1_0
;
}
}
bool
OpenCLRuntime
::
IsOutOfRangeCheckEnabled
()
const
{
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
25d2d959
...
...
@@ -38,6 +38,13 @@ enum GPUType {
UNKNOWN
,
};
enum
OpenCLVersion
{
CL_VER_1_0
,
CL_VER_1_1
,
CL_VER_1_2
,
CL_VER_2_0
,
};
const
std
::
string
OpenCLErrorToString
(
cl_int
error
);
...
...
@@ -113,7 +120,7 @@ class OpenCLRuntime {
const
std
::
string
&
built_program_key
,
const
std
::
string
&
build_options_str
,
cl
::
Program
*
program
);
const
std
::
string
ParseDeviceVersion
(
const
std
::
string
&
device_version
);
OpenCLVersion
ParseDeviceVersion
(
const
std
::
string
&
device_version
);
private:
std
::
unique_ptr
<
KVStorage
>
precompiled_binary_storage_
;
...
...
@@ -127,7 +134,7 @@ class OpenCLRuntime {
std
::
map
<
std
::
string
,
cl
::
Program
>
built_program_map_
;
std
::
mutex
program_build_mutex_
;
std
::
string
platform_info_
;
std
::
string
opencl_version_
;
OpenCLVersion
opencl_version_
;
std
::
string
precompiled_binary_platform_info_
;
std
::
string
cached_binary_platform_info_
;
bool
out_of_range_check_
;
...
...
mace/kernels/opencl/batch_norm.cc
浏览文件 @
25d2d959
...
...
@@ -118,10 +118,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input
->
shape
();
}
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size_
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
4
,
kwg_size_
/
lws
[
1
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size_
/
(
lws
[
1
]
*
lws
[
0
]));
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"batch_norm_opencl_kernel"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
25d2d959
...
...
@@ -25,11 +25,11 @@ namespace {
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
)
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/conv_2d.cc
浏览文件 @
25d2d959
...
...
@@ -80,8 +80,8 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
static
const
Conv2dOpenclFunction
selector
[
3
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
};
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
...
...
@@ -113,7 +113,7 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
&
output_image_shape
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
5
&&
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
3
&&
selector
[
kernel_h
-
1
]
!=
nullptr
)
{
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
return
conv2d_func
(
...
...
mace/kernels/opencl/conv_2d_1x1.cc
浏览文件 @
25d2d959
...
...
@@ -29,7 +29,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
compute_units
=
OpenCLRuntime
::
Global
()
->
device_compute_units
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
const
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
if
(
lws
[
1
]
>=
base
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
);
...
...
@@ -48,7 +49,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
);
}
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/conv_2d_3x3.cc
浏览文件 @
25d2d959
...
...
@@ -30,7 +30,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
uint32_t
compute_units
=
std
::
max
<
uint32_t
>
(
OpenCLRuntime
::
Global
()
->
device_compute_units
()
/
2
,
1
);
const
uint32_t
base
=
std
::
min
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
4
);
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
4
),
1
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
),
kwg_size
/
lws
[
1
]);
...
...
@@ -42,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
);
}
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/conv_2d_general.cc
浏览文件 @
25d2d959
...
...
@@ -32,7 +32,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
compute_units
=
OpenCLRuntime
::
Global
()
->
device_compute_units
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
const
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
0
]
=
gws
[
0
]
/
4
;
if
(
lws
[
0
]
==
0
)
{
...
...
@@ -51,7 +52,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws
[
2
]
=
base
;
}
}
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/deconv_2d_opencl.cc
浏览文件 @
25d2d959
...
...
@@ -144,7 +144,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
*
kwg_size
)
;
std
::
string
tuning_key
=
Concat
(
"deconv2d_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/depthwise_conv.cc
浏览文件 @
25d2d959
...
...
@@ -27,24 +27,26 @@ const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
min_lws0
=
cache_size
/
kBaseGPUMemCacheSize
;
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
if
(
lws
[
1
]
>=
min_lws0
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
min_lws0
);
if
(
lws
[
1
]
>=
base
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
);
}
else
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
]
/
8
,
kwg_size
/
lws
[
1
]);
if
(
lws
[
0
]
<
min_lws0
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
std
::
max
<
uint32_t
>
(
gws
[
0
]
/
4
,
min_lws0
),
if
(
lws
[
0
]
<
base
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
std
::
max
<
uint32_t
>
(
gws
[
0
]
/
4
,
base
),
kwg_size
/
lws
[
1
]);
}
}
lws
[
0
]
=
std
::
max
<
uint32_t
>
(
lws
[
0
],
1
);
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
((
cache_size
/
kernel_cache_size
/
lws_size
)
*
4
,
gws
[
2
]);
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
gws
[
2
];
}
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
25d2d959
...
...
@@ -252,7 +252,8 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
1
]
*
lws
[
2
];
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
);
lws
[
0
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/pooling.cc
浏览文件 @
25d2d959
...
...
@@ -26,7 +26,7 @@ namespace {
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
)
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
...
...
@@ -35,7 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
if
(
lws
[
0
]
==
0
)
{
lws
[
0
]
=
gws
[
0
];
}
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
lws
[
0
],
kwg_size
/
lws_size
);
lws
[
0
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
0
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/resize_bilinear.cc
浏览文件 @
25d2d959
...
...
@@ -26,7 +26,7 @@ namespace {
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
)
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
if
(
lws
[
1
]
>=
base
)
{
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
);
...
...
@@ -42,7 +42,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
gws
[
2
];
}
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
);
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
lws
[
2
],
kwg_size
/
lws_size
),
1
);
return
lws
;
}
...
...
mace/kernels/opencl/softmax.cc
浏览文件 @
25d2d959
...
...
@@ -26,7 +26,7 @@ namespace {
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
uint32_t
base
=
std
::
max
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
1
)
;
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
if
(
gws
[
0
]
<
base
)
{
...
...
@@ -35,7 +35,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
lws
[
0
]
=
gws
[
0
]
/
base
;
}
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
lws
[
0
],
kwg_size
/
lws
[
1
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
lws
[
0
]
*
lws
[
1
]));
lws
[
2
]
=
std
::
max
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
lws
[
0
]
*
lws
[
1
])),
1
);
return
lws
;
}
...
...
mace/ops/buffer_to_image_test.cc
浏览文件 @
25d2d959
...
...
@@ -136,7 +136,7 @@ TEST(BufferToImageTest, WeightWidthMedium) {
TEST
(
BufferToImageTest
,
WeightWidthLarge
)
{
TestBidirectionTransform
<
DeviceType
::
GPU
,
float
>
(
kernels
::
WEIGHT_WIDTH
,
{
64
,
128
,
11
,
13
});
{
64
,
64
,
11
,
13
});
}
TEST
(
BufferToImageTest
,
WeightHeightSmall
)
{
...
...
@@ -151,7 +151,7 @@ TEST(BufferToImageTest, WeightHeightMedium) {
TEST
(
BufferToImageTest
,
WeightHeightLarge
)
{
TestBidirectionTransform
<
DeviceType
::
GPU
,
float
>
(
kernels
::
WEIGHT_HEIGHT
,
{
64
,
32
,
11
,
13
});
{
64
,
16
,
11
,
13
});
}
namespace
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录