Commit 909f89d9 authored by 叶剑武

Merge branch 'cpu' into 'master'

Support running on a local PC.

See merge request !232
......@@ -51,3 +51,11 @@ config_setting(
},
visibility = ["//visibility:public"],
)
+ config_setting(
+     name = "openmp_enabled",
+     define_values = {
+         "openmp": "true",
+     },
+     visibility = ["//visibility:public"],
+ )
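Note: the new `openmp_enabled` setting is selected with `--define openmp=true` (the build scripts at the end of this diff pass exactly that flag), and any target that then receives `-fopenmp` can detect it at compile time through the standard `_OPENMP` macro. A minimal sketch, independent of MACE:

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
  // _OPENMP is predefined by the compiler whenever -fopenmp is in effect,
  // which is what the openmp_enabled config_setting above toggles.
#ifdef _OPENMP
  std::printf("OpenMP enabled, max threads: %d\n", omp_get_max_threads());
#else
  std::printf("OpenMP disabled, running single-threaded\n");
#endif
  return 0;
}
```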
......@@ -42,9 +42,8 @@ cc_library(
"runtime/opencl/*.h",
"runtime/hexagon/*.h",
]),
- linkopts = if_android([
+ linkopts = ["-ldl",] + if_android([
"-pie",
- "-ldl",
"-lm",
]),
deps = [
......
......@@ -94,7 +94,6 @@ class Operator : public OperatorBase {
for (const string &output_str : operator_def.output()) {
if (ws->HasTensor(output_str)) {
- Tensor *found_tensor = ws->GetTensor(output_str);
outputs_.push_back(ws->GetTensor(output_str));
} else {
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
......
......@@ -158,7 +158,6 @@ bool HexagonControlWrapper::TeardownGraph() {
void HexagonControlWrapper::PrintLog() {
char *buf;
- unsigned char *p;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
LOG(INFO) << string(buf);
......@@ -168,7 +167,6 @@ void HexagonControlWrapper::PrintLog() {
void HexagonControlWrapper::PrintGraph() {
LOG(INFO) << "Print Graph";
char *buf;
- unsigned char *p;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
LOG(INFO) << string(buf);
......
# Examples
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_binary(
name = "helloworld",
srcs = [
"helloworld.cc",
],
- linkopts = if_neon_enabled(["-fopenmp"]),
+ linkopts = if_openmp_enabled(["-fopenmp"]),
deps = [
"//mace/core",
"//mace/ops",
......@@ -17,7 +17,7 @@ cc_test(
name = "benchmark_example",
testonly = 1,
srcs = ["benchmark_example.cc"],
- linkopts = if_neon_enabled(["-fopenmp"]),
+ linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_library(
name = "kernels",
......@@ -23,7 +23,7 @@ cc_library(
]) + if_neon_enabled(glob([
"neon/*.h",
])),
- copts = if_neon_enabled(["-fopenmp"]),
+ copts = if_openmp_enabled(["-fopenmp"]),
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
......
......@@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase {
}
}
- index_t pos = 0;
- #pragma omp parallel for
+ #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
+ index_t pos = (((n * height) + h) * width + w) * channels + c;
if (folded_constant_) {
output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
} else {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
}
- ++pos;
}
}
}
......
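This indexing change is the core of the OpenMP fixes repeated below in the bias add, convolution, and pooling hunks: a running `pos` counter (`++pos`) assumes iterations execute in order, so under `#pragma omp parallel for collapse(4)` each iteration must derive its flat NHWC offset from its own loop indices. The same reasoning explains the removed `*output_data`/`output_data++` writes later in this diff: pointer bumping is a serial dependency, while indexed stores are race-free because every (n, h, w, c) maps to a unique offset. A minimal standalone sketch of the corrected pattern (hypothetical function name):

```cpp
#include <cstdint>

using index_t = int64_t;

// Each collapsed iteration computes its own NHWC offset, so iterations can
// run on any thread in any order without sharing mutable state.
void ScaleOffsetNHWC(const float *input, const float *scale,
                     const float *offset, float *output,
                     index_t batch, index_t height,
                     index_t width, index_t channels) {
#pragma omp parallel for collapse(4)
  for (index_t n = 0; n < batch; ++n) {
    for (index_t h = 0; h < height; ++h) {
      for (index_t w = 0; w < width; ++w) {
        for (index_t c = 0; c < channels; ++c) {
          const index_t pos = (((n * height) + h) * width + w) * channels + c;
          output[pos] = scale[c] * input[pos] + offset[c];
        }
      }
    }
  }
}
```

Note that `collapse(4)` requires the four loops to be perfectly nested, which they are here and in every functor touched by this commit.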
......@@ -33,14 +33,13 @@ struct BiasAddFunctor {
T *output_ptr = output->mutable_data<T>();
- index_t pos = 0;
- #pragma omp parallel for
+ #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
+ index_t pos = (((n * height) + h) * width + w) * channels + c;
output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
- ++pos;
}
}
}
......
......@@ -94,8 +94,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
index_t kernel_size = input_channels * kernel_h * kernel_w;
- Tensor::MappingGuard input_mapper(input);
- Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -105,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase {
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
+ #pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ const int out_idx = ((n * height + h) * width + w) * channels + c;
T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
- *output_data = bias_channel;
+ output_data[out_idx] = bias_channel;
T sum = 0.0f;
const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -125,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
inw >= padded_w_start && inw < padded_w_stop,
"Out of range read from input: ", inh, ", ",
inw);
- // else padding with 0:
- // sum += 0;
} else {
index_t input_offset =
n * input_height * input_width * input_channels +
......@@ -138,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase {
}
}
}
- *output_data += sum;
- output_data++;
+ output_data[out_idx] += sum;
}
}
}
}
- output_data = output->mutable_data<T>();
DoActivation(output_data, output_data, output->NumElements(), activation_,
relux_max_limit_, prelu_alpha_);
}
......
......@@ -105,8 +105,6 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
const index_t kernel_size = kernel_h * kernel_w;
- Tensor::MappingGuard input_mapper(input);
- Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -116,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
T *output_ptr = output->mutable_data<T>();
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
......
......@@ -62,17 +62,17 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
+ int in_hb_idx = height_idx;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
- // TODO (heliangliang) optimize out these muls
- int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
- in_hb_value = select(in_hb_value + batch_idx,
- -1,
- (in_hb_value < 0 || in_hb_value >= in_height));
+ int in_hb_value = select(in_hb_idx + batch_idx,
+ -1,
+ (in_hb_idx < 0 || in_hb_idx >= in_height));
int filter_x_part1 = 0;
+ int in_width_idx = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
- in_width_value = in_width##i + mul24(width_idx, dilation_w); \
+ in_width_value = in_width##i + in_width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
......@@ -120,8 +120,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4 = mad(in4.w, weights3, out4);
filter_x_part1 += rounded_in_ch;
+ in_width_idx += dilation_w;
}
filter_x_part0 += rounded_in_ch_x_3;
+ in_hb_idx += dilation_h;
}
}
......
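The kernel edit above is a strength reduction: the per-iteration `mul24(hb_idx, dilation_h)` and `mul24(width_idx, dilation_w)` are replaced by running indices (`in_hb_idx`, `in_width_idx`) that are advanced by the dilation at the bottom of each loop, which also resolves the old `TODO` comment. The same transformation in plain C++, as an illustrative sketch:

```cpp
#include <cstdio>

int main() {
  const int base = 5, dilation = 2;

  // Before: one multiply per iteration to form base + k * dilation.
  for (int k = 0; k < 3; ++k) {
    std::printf("mul form: %d\n", base + k * dilation);
  }

  // After (as in the kernel): carry a running index and replace the
  // multiply with a single add at the end of each iteration.
  int pos = base;
  for (int k = 0; k < 3; ++k) {
    std::printf("add form: %d\n", pos);
    pos += dilation;
  }
  return 0;
}
```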
......@@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
+ {4, kwg_size / 16, 4, 1},
+ {4, kwg_size / 28, 7, 1},
+ {4, kwg_size / 32, 8, 1},
+ {4, kwg_size / 56, 14, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
......
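The four added tuning candidates all keep the product of the three local work-group dimensions within the kernel's work-group limit, e.g. 4 × (kwg_size / 16) × 4 = kwg_size and 4 × (kwg_size / 28) × 7 = kwg_size. A small sketch of that invariant, with hypothetical names (the tuner's real candidate handling is not shown in this hunk):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical validity check: a 3D local work-group candidate is usable
// only if the product of its dimensions fits the kernel work-group limit.
bool FitsWorkGroup(const std::vector<uint32_t> &lws, uint32_t kwg_size) {
  return lws[0] * lws[1] * lws[2] <= kwg_size;
}

int main() {
  const uint32_t kwg_size = 256;
  // Mirrors the added {4, kwg_size / 16, 4, ...} candidate: the product of
  // the first three dimensions is exactly kwg_size.
  assert(FitsWorkGroup({4, kwg_size / 16, 4}, kwg_size));
  return 0;
}
```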
......@@ -76,7 +76,6 @@ struct PoolingFunctor : PoolingFunctorBase {
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
- index_t out_image_size = height * width;
index_t input_height = input_shape[1];
index_t input_width = input_shape[2];
......@@ -97,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase {
int padded_w_start = 0 - paddings[1] / 2;
if (pooling_type_ == MAX) {
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -115,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
- *output = res;
- output++;
+ output[out_offset] = res;
}
}
}
}
} else if (pooling_type_ == AVG) {
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0;
int block_size = 0;
......@@ -142,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
- *output = sum / block_size;
- output++;
+ output[out_offset] = sum / block_size;
}
}
}
......
......@@ -29,21 +29,20 @@ struct SoftmaxFunctor {
const index_t num_classes = logits_shape.back();
#pragma omp parallel for
for (index_t i = 0; i < batch_size; ++i) {
- T max_value = *logits_ptr;
+ const index_t pos = i * num_classes;
+ T max_value = logits_ptr[pos];
for (index_t c = 1; c < num_classes; ++c) {
- max_value = std::max(max_value, logits_ptr[c]);
+ max_value = std::max(max_value, logits_ptr[pos + c]);
}
// TODO: check overflow?
T sum = 0;
std::vector<T> exp_data(num_classes);
for (index_t c = 0; c < num_classes; ++c) {
- exp_data[c] = ::exp((*logits_ptr - max_value));
+ exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
sum += exp_data[c];
- logits_ptr++;
}
for (index_t c = 0; c < num_classes; ++c) {
- *output_ptr = exp_data[c] / sum;
- output_ptr++;
+ output_ptr[pos + c] = exp_data[c] / sum;
}
}
}
......
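With `pos = i * num_classes`, every batch row reads and writes only its own `[pos, pos + num_classes)` slice, which is what makes the enclosing `#pragma omp parallel for` safe; the old `logits_ptr++`/`output_ptr++` bumping serialized the rows. Subtracting the row max before `exp` is the usual guard against overflow, addressing the `TODO` in the hunk. A self-contained sketch of the same row-wise computation:

```cpp
#include <algorithm>
#include <cmath>

// Row-wise numerically stable softmax over a [batch_size, num_classes]
// buffer, mirroring the indexing above; rows are independent, so a
// parallel-for over i would be race-free.
void Softmax(const float *logits, float *output,
             int batch_size, int num_classes) {
  for (int i = 0; i < batch_size; ++i) {
    const int pos = i * num_classes;
    const float max_value =
        *std::max_element(logits + pos, logits + pos + num_classes);
    float sum = 0.f;
    for (int c = 0; c < num_classes; ++c) {
      output[pos + c] = std::exp(logits[pos + c] - max_value);
      sum += output[pos + c];
    }
    for (int c = 0; c < num_classes; ++c) {
      output[pos + c] /= sum;
    }
  }
}
```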
......@@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a):
"//mace:hexagon_enabled": [],
"//conditions:default": a,
})
+ def if_openmp_enabled(a):
+     return select({
+         "//mace:openmp_enabled": a,
+         "//conditions:default": [],
+     })
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_library(
name = "test",
......@@ -34,7 +34,7 @@ cc_library(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
deps = [
"//mace/kernels",
],
......
......@@ -52,7 +52,7 @@ inline std::string ObfuscateString(const std::string &src,
const std::string &lookup_table) {
std::string dest;
dest.resize(src.size());
- for (int i = 0; i < src.size(); i++) {
+ for (size_t i = 0; i < src.size(); i++) {
dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
}
return std::move(dest);
......@@ -73,7 +73,7 @@ inline std::string ObfuscateSymbol(const std::string &src) {
dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
const std::string encode_dict =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
- for (int i = 1; i < src.size(); i++) {
+ for (size_t i = 1; i < src.size(); i++) {
char ch = src[i];
int idx;
if (ch >= '0' && ch <= '9') {
......
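Aside from the `int` to `size_t` fix (which matches the return type of `std::string::size()` and silences signed/unsigned comparison warnings), note that `ObfuscateString` is a repeating-key XOR, so applying it twice with the same lookup table returns the original string. A quick round-trip check with a standalone re-implementation of the same idea:

```cpp
#include <cassert>
#include <string>

// Repeating-key XOR, the same scheme as ObfuscateString above.
std::string Obfuscate(const std::string &src, const std::string &table) {
  std::string dest(src.size(), '\0');
  for (size_t i = 0; i < src.size(); ++i) {
    dest[i] = src[i] ^ table[i % table.size()];
  }
  return dest;
}

int main() {
  const std::string key = "mace";
  const std::string text = "hello world";
  // XOR is self-inverse: obfuscating twice restores the input.
  assert(Obfuscate(Obfuscate(text, key), key) == text);
  return 0;
}
```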
......@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
--copt="-Werror=return-type" \
- --define neon=false
+ --define neon=false \
+ --define openmp=true
if [ $? -ne 0 ]; then
exit 1
......
......@@ -71,6 +71,17 @@ build_target()
$DSP_MODE_BUILD_FLAGS || exit 1
}
+ build_local_target()
+ {
+     BAZEL_TARGET=$1
+     bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
+         --copt="-std=c++11" \
+         --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
+         --copt="-Werror=return-type" \
+         --copt="-DMACE_OBFUSCATE_LITERALS" \
+         --define openmp=true || exit -1
+ }
merge_libs()
{
CREATE_LIB_NAME=$1
......@@ -113,10 +124,17 @@ bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || e
echo "Step 3: Build libmace targets"
bazel clean
- for target in ${all_targets[*]}
- do
-     build_target ${target}
- done
+ if [ x"${RUNTIME}" = x"local" ]; then
+     for target in ${all_targets[*]}
+     do
+         build_local_target ${target}
+     done
+ else
+     for target in ${all_targets[*]}
+     do
+         build_target ${target}
+     done
+ fi
echo "Step 4: Create mri files and generate merged libs"
......